From 9afd31e66ddbe6ecb2206c4d2edf72b7a653702c Mon Sep 17 00:00:00 2001 From: Damian Stachura Date: Wed, 11 Feb 2026 18:25:16 +0100 Subject: [PATCH 1/2] Migrated HELM leaderboards into schema v0.2 --- ...bd982107-7c03-4ee8-8a38-782d68883818.json} | 90 +- ...25aa6e41-ab16-4f63-9613-bfb83b9151c5.json} | 90 +- ...ddd52881-1248-4652-9f1d-5d2b58ede889.json} | 90 +- ...365bc693-73b6-41fe-a8fa-eba7b91febe0.json} | 90 +- ...a126b881-918a-411a-90e9-32d7b63d1e00.json} | 84 +- ...b8e54bb1-0768-4558-8dc2-4897d4e571aa.json} | 84 +- ...a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json} | 84 +- ...2413b504-7125-461b-ae9d-0c58211a5358.json} | 84 +- ...f350d9d1-b743-4017-bc68-a4dc726515d0.json} | 86 +- ...c32a1f0a-bf8a-42be-b155-4f87465235bc.json} | 86 +- ...96cfde1b-77de-4d2a-8b45-938116795108.json} | 86 +- .../56c180e5-45aa-4106-8f92-c6566c3c7dfc.json | 345 +++++++ ...d633fcd6-eb01-49ff-ba7c-6ca12734746f.json} | 86 +- ...7a7b49ff-5060-4d12-acb9-607125fbe081.json} | 86 +- ...287a3646-d969-4bd9-9773-86463c1ba87f.json} | 86 +- ...97f3892f-9588-49ef-abef-3a0c965bb352.json} | 86 +- .../22ba68b0-6eec-47f2-b465-47f298e8da09.json | 345 +++++++ ...9e5684dc-6380-4353-b966-7205d66340fa.json} | 84 +- ...1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json} | 84 +- ...20512a3b-ac0f-483a-8bec-9962980c579c.json} | 86 +- ...704c5c74-a0ee-457d-9b4e-3ae895ffc105.json} | 86 +- ...eb9224b8-0edb-4605-a2ee-cfb63f41370e.json} | 84 +- ...4cb58f80-c2b1-45c6-b781-19af47660eb0.json} | 86 +- ...6307e0c4-c983-4257-82d8-b2a50171eb8a.json} | 84 +- ...275cd615-bddf-4afe-a499-b463fe183486.json} | 86 +- ...03b48360-a387-44ba-94b2-2eb7c234a9fa.json} | 86 +- .../3a242fb8-07f9-460e-93eb-345aab0f994f.json | 345 +++++++ ...5e5720d0-67fe-40a9-b65b-d4154848d83c.json} | 84 +- .../9c9239df-0cbb-411f-af40-1b2782f91255.json | 345 +++++++ .../e1d12d96-185f-493e-bb08-8237623fb736.json | 345 +++++++ ...aba1fded-b031-48df-87ef-dc744df33501.json} | 90 +- ...98f69aa6-b227-4076-a76e-1293cbe1c6cb.json} | 86 +- ...d2bb087e-a275-4fce-b6dc-001fd4545883.json} | 86 +- ...84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json} | 86 +- ...23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json} | 86 +- ...9cab3a77-4f32-48d0-ba11-e2323ccc4861.json} | 86 +- ...9e037c92-1253-49be-b31a-3aa017531d77.json} | 86 +- ...bd26c7cb-ce76-4b17-b617-d1d93a168c93.json} | 86 +- ...9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json} | 86 +- ...d69a1cbe-353c-4be9-b93b-5224d24c7adf.json} | 86 +- ...915cb39d-f21f-4ef1-a95f-f44f79ede893.json} | 86 +- ...fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json} | 84 +- ...eb51f418-6abf-4b2c-9f57-0b830c00bd15.json} | 86 +- ...41cd14b0-46ba-49da-844a-19fe866bef1e.json} | 86 +- ...7de93642-a4bc-430b-8733-9befeb6a0e23.json} | 86 +- ...4f18292a-1fef-4feb-9b17-045c96e3e137.json} | 86 +- ...7458c032-b24d-4f13-a659-b6e19d19a8e1.json} | 86 +- ...21eb1648-aad0-4297-9edc-c445e4c38694.json} | 86 +- ...99d657ae-e850-4caf-a599-13f1b8072273.json} | 86 +- ...10cd766e-442c-4b3d-833b-740417d9a6d9.json} | 86 +- .../bc6124a7-89df-4c3e-b824-56c948d1eeb5.json | 345 +++++++ ...06719cd4-5654-49b6-9dee-e112d1601d1c.json} | 84 +- ...ed849999-48c2-4569-8bcd-dc73084e3197.json} | 84 +- ...01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json} | 86 +- ...32382d69-21c7-43a9-bb95-27607ec18cc9.json} | 86 +- ...77e702f7-37ef-4487-b047-74b13ef6d966.json} | 86 +- ...4ee3c647-740c-41a6-ac66-4a38b09317ff.json} | 86 +- ...ca30726a-00a6-4228-94fe-5dce00de1d5e.json} | 84 +- ...7862890a-298b-4bda-b8f1-7be6a5779365.json} | 84 +- .../8c73a09f-ba0d-4c12-a12a-776a17292151.json | 345 +++++++ ...442aed0d-95c3-4436-ad63-b7b1e93307f4.json} | 84 +- ...7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json} | 84 +- ...bc2c91e0-6afd-4e44-b665-d5c7558f8981.json} | 84 +- ...a74b74f7-ccce-4341-a122-26728cc6bece.json} | 84 +- ...87811b75-afe8-413b-949d-7fd1f582a2e8.json} | 84 +- ...ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json} | 84 +- ...924080a0-c530-4e6d-b1a4-107de3bd7183.json} | 86 +- ...be23c720-a99a-4945-bc0b-ddc27c8eec39.json} | 84 +- ...425d4a41-2def-4581-9b61-ee33ecb3a822.json} | 179 +++- ...c12a8494-bafc-4097-874a-7c00636e96f8.json} | 175 +++- ...4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json} | 175 +++- ...19f61327-fcc3-408f-9254-2d6a2aadcd4e.json} | 175 +++- ...ccc17d56-bd26-409c-ac3f-262eaba9ce21.json} | 175 +++- ...f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json} | 175 +++- ...9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json} | 175 +++- ...f25c142c-8730-4241-a649-01d076e1f28d.json} | 175 +++- ...ab34f23e-36db-40c0-9681-f30b00692f98.json} | 175 +++- ...67281534-a03d-49d8-a586-25cb1a03134e.json} | 175 +++- ...3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json} | 175 +++- ...04ce2ba4-c382-4658-ba06-1def9499a243.json} | 175 +++- ...3a546396-d031-4958-8410-00e0d3406089.json} | 175 +++- ...e7b99aa6-08e8-4224-a805-16586eb44325.json} | 175 +++- ...43a3fe19-929a-463d-a0ed-791dad765188.json} | 175 +++- ...75468958-b75b-41fe-9813-070b793e86d9.json} | 175 +++- ...6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json} | 175 +++- ...3c9c425a-ce4a-4958-9744-7f9490ed5729.json} | 175 +++- ...5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json} | 175 +++- ...8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json} | 175 +++- ...f8044c74-3f1c-4562-a21c-e448061b2077.json} | 175 +++- ...4abe3a0d-ba04-41f7-b107-59f11ff5697a.json} | 179 +++- ...646adb7b-0761-4639-8776-83ea158bfca4.json} | 179 +++- ...85cf6be2-d066-4e1b-b373-d53d3c922184.json} | 179 +++- ...52db5c6d-b54e-401a-880d-8ab41a394bc4.json} | 175 +++- ...68becad6-9455-4d3d-8d68-d1b4448598a1.json} | 175 +++- ...519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json} | 175 +++- ...972bc5db-f536-42f9-aa51-83cc2f59b76a.json} | 175 +++- ...b2220101-56e0-49d9-a3d1-d3bec769ab97.json} | 175 +++- ...96907b25-05c3-441b-afc4-69274c20bfc3.json} | 175 +++- ...66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json} | 175 +++- ...70e9e156-6807-489b-b77a-367236614826.json} | 175 +++- ...e90cfb46-1173-4d22-9329-9bf57cdd5241.json} | 175 +++- ...baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json} | 175 +++- ...7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json} | 175 +++- ...ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json} | 175 +++- ...26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json} | 175 +++- ...ecd21c26-cdc4-43b1-b933-4d970df9413a.json} | 175 +++- ...9d4350eb-cdf0-432f-b3b0-45f4832ca950.json} | 175 +++- ...3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json} | 179 +++- ...b277c87e-54b5-466f-97d7-35db4cd7b985.json} | 175 +++- ...270df23b-9e58-4259-a8ed-0d25b9c80b2a.json} | 175 +++- ...1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json} | 179 +++- ...ef171b67-72a6-46d3-9eaf-4614ff474852.json} | 179 +++- ...e6ea5f7e-0533-4a99-8638-1cc10c454238.json} | 175 +++- ...83c924fe-6318-4bad-adb0-8a81e5e28ee0.json} | 175 +++- ...82e2c0e3-66f2-431f-b4b8-d2495970d998.json} | 175 +++- ...6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json} | 175 +++- ...e18fbf9e-677c-49fb-ab76-475e8f605f01.json} | 175 +++- ...039af363-0c5c-4e36-8396-cd57c7e4c1de.json} | 175 +++- ...8ea1facb-260a-461d-9271-2c07b318c46f.json} | 175 +++- ...93007ac9-04c2-451d-abd2-2f235297747e.json} | 175 +++- ...b04e5f90-e46e-4d7a-a6a9-569bde072208.json} | 175 +++- ...933dc76f-45f0-48e0-93ae-3e19cff87c2a.json} | 175 +++- ...b8408a64-eb89-4337-8ee5-3c48e4e24437.json} | 175 +++- ...d5846321-0800-4ff9-b85c-53c8b4884ba5.json} | 175 +++- ...baa5f92c-b626-4e09-a084-61ce7f5dee98.json} | 179 +++- ...9b648e90-8d3c-403d-9ad8-382ef0b212a6.json} | 179 +++- ...0692f762-337e-4c20-8ad6-feecc93882a3.json} | 179 +++- ...a91c9563-0756-4616-8a58-3c8000f73895.json} | 179 +++- ...3a329574-dcf6-4177-b37c-c495e6af6cc5.json} | 175 +++- ...9e662c1e-e77c-4fb3-b589-127683a4b2ca.json} | 175 +++- ...375140f6-bd3f-4b55-a35c-23de37254296.json} | 175 +++- ...021d0b25-8f58-47da-a58c-ac532a7972bf.json} | 175 +++- ...9207fec4-d0c4-4f66-b917-f5ed57409215.json} | 175 +++- ...b04c8845-cccf-4856-9597-ab283bb2ec8d.json} | 175 +++- ...4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json} | 175 +++- ...0e30e895-aaf7-42d4-95db-7541d6b41c87.json} | 183 ++-- ...4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json} | 185 ++-- ...8befd29c-a16d-4e05-a92f-00b621d45e03.json} | 185 ++-- ...b2e193b8-215b-4e80-9d5a-df11f1dac88a.json} | 185 ++-- ...eedd0f38-6d26-4297-a469-291227ec6be6.json} | 184 +++- ...74c47665-740f-4784-8a27-1c1d1c29bff8.json} | 184 +++- ...8027b577-7f48-4df5-9879-bd45ac342f42.json} | 184 +++- ...e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json} | 184 +++- ...24e11e7b-15d6-4a09-9545-38486d0eb236.json} | 184 +++- ...eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json} | 184 +++- ...52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json} | 184 +++- ...68713712-ae92-474b-84c0-1b8301538439.json} | 184 +++- ...15cc9411-6ea4-4f10-831f-23ff27fd5704.json} | 182 +++- ...3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json} | 182 +++- ...1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json} | 180 +++- ...078d812b-2198-4497-8fbe-06fb640fd86d.json} | 184 +++- ...f928a53d-9d67-45e7-a871-04359c8162d5.json} | 184 +++- ...741c4560-eb35-4edf-a48b-af29e743740a.json} | 184 +++- ...4e8a8384-5f1d-4b76-be9b-385407332d6c.json} | 184 +++- ...0684c1d2-ea43-4341-820c-09051f5e11f2.json} | 182 +++- ...51821ca1-7eac-4094-abac-98b2484cc5a0.json} | 182 +++- ...8a0f5749-7f6a-4813-9c08-7283433c1337.json} | 186 ++-- ...4697983d-a29a-484d-9268-7974117456e8.json} | 184 +++- ...60e33aa3-0593-42e6-9baa-8311746deca0.json} | 184 +++- ...2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json} | 184 +++- ...9ad91ee2-7a64-4f94-9166-f2681777023b.json} | 184 +++- ...4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json} | 184 +++- ...64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json} | 182 +++- ...fe8a36b0-4361-461b-b310-656c54131fa6.json} | 182 +++- ...b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json} | 182 +++- ...67967a2a-5fb4-46e8-b1ec-eda1588d9086.json} | 182 +++- ...0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json} | 182 +++- ...ba5eea81-2120-4a20-8322-dfbd29cd197c.json} | 182 +++- ...9dd66ede-da5c-4627-92ed-7057c9a2bea3.json} | 182 +++- ...801aa7da-90b2-48d1-ad3d-943b06bd437c.json} | 184 +++- ...a58923ea-fa22-4c45-8327-efbe84c8a05d.json} | 182 +++- ...bab8d241-fad0-4230-b213-c2eeccc79f12.json} | 184 +++- ...65e37589-ef26-46cd-a627-798af70e75bf.json} | 184 +++- ...f499f9c6-4c9a-43ba-b4c3-d094494a371c.json} | 184 +++- ...27a54446-57b2-4239-b768-7ab85dc94c54.json} | 184 +++- ...5de8a13e-a029-4a90-9a2d-c28a59212140.json} | 184 +++- ...f9643ce2-7347-401b-903e-fadcc5221f36.json} | 186 ++-- ...9932e430-2039-40b0-bc8f-ae2d833543e8.json} | 184 +++- ...dbd2e9bb-c2ca-4165-b229-d736a70721a5.json} | 184 +++- ...32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json} | 184 +++- ...70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json} | 184 +++- ...07a367ee-2879-4ede-bbf8-33b24d682467.json} | 184 +++- ...fee914c7-d6bf-4d61-9f50-71bae5f11006.json} | 184 +++- ...b0577066-231e-461b-bae8-b724b204397a.json} | 184 +++- ...b79fe2e3-5eec-46f8-90a1-810781c8c46a.json} | 184 +++- ...998616ef-5d1b-4c65-b6ad-23afc3630d5a.json} | 184 +++- ...fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json} | 184 +++- ...25fde5e6-86b8-4a80-8f79-5946ef9999fc.json} | 184 +++- ...b955825d-ae7f-48c4-9dad-5ee78879737d.json} | 184 +++- ...168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json} | 184 +++- ...0807e353-9787-4ca0-8f7b-50d1bed2469e.json} | 184 +++- ...0164b885-2c27-4eba-8e6f-e69156cb0dee.json} | 184 +++- ...08422837-51a0-45c9-9004-fc5d98dce462.json} | 184 +++- ...39f2c7f2-56d4-4349-95ae-374d34263f48.json} | 184 +++- ...0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json} | 182 +++- ...75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json} | 184 +++- ...2de4b89a-3f3b-4d1d-ba85-030953a46956.json} | 184 +++- ...bd68405f-fe9a-448b-9c80-468c656594e5.json} | 184 +++- ...4267fef1-3180-46e3-990e-0d1092ec4c18.json} | 184 +++- ...002a34dc-39e5-451d-b2a8-b51bdb69a056.json} | 184 +++- ...5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json} | 184 +++- ...ad2beded-cec3-4b47-b8de-a32a3225fa66.json} | 184 +++- ...eb901347-fc1f-4d8f-a70a-05a83e16658d.json} | 184 +++- ...9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json} | 184 +++- ...042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json} | 184 +++- ...d2d48e4a-0484-4f44-8108-2e689d7ca695.json} | 184 +++- ...e54ae605-a91d-47d7-a08d-67bd0ea5c606.json} | 184 +++- ...15dccf75-871d-457b-8495-e0d03d550360.json} | 184 +++- ...18fe5d30-bf36-405a-819e-1ecabda327ea.json} | 184 +++- ...cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json} | 184 +++- ...cd199905-04a4-4745-b848-4f7bde97ca17.json} | 184 +++- ...1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json} | 184 +++- ...bfd70aff-bf45-4f55-b730-4924afc181cd.json} | 184 +++- ...b6e08679-1bd7-42a1-9eee-98252de2c7c1.json} | 184 +++- ...22b411d5-a314-4b17-a9c7-c1af7ca7df33.json} | 184 +++- ...f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json} | 184 +++- ...fb1bb023-16f6-4914-889b-6458d7ab1277.json} | 184 +++- ...8b572c10-3553-4e51-a321-bdb05996914b.json} | 184 +++- ...6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json} | 184 +++- ...e0efe169-d28e-418e-a78c-9b04ec29aae2.json} | 184 +++- ...05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json} | 184 +++- ...983696ae-d7f3-48a4-b7a0-a42487728182.json} | 186 ++-- ...a969e516-adef-4839-9252-244c58ab3c67.json} | 186 ++-- ...f122f9de-b1ce-40ea-8731-6c00c7af0498.json} | 182 +++- ...5c7982c5-3513-4ff2-9857-33a0db825376.json} | 184 +++- ...4910859a-750c-4728-bf30-309e0e81690e.json} | 184 +++- ...32f0532f-b504-492d-84d7-f541930edad0.json} | 182 +++- ...04c187a3-4532-4523-b39d-19314d61c779.json} | 190 ++-- ...4440532c-9b49-4c9a-8bf4-f122531c54fa.json} | 184 +++- ...bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json} | 184 +++- ...3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json} | 850 ++++++++++++------ ...6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json} | 850 ++++++++++++------ ...3d0b3d68-a853-4989-a35e-83ac6722c2da.json} | 850 ++++++++++++------ ...ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json} | 848 +++++++++++------ ...517e8027-6edd-482b-86f3-33b6c41a9609.json} | 848 +++++++++++------ ...f7c1c125-ad0f-4847-b880-4f705f1666c6.json} | 848 +++++++++++------ ...5a0ba280-8a12-4735-9d92-4ed71ba395b4.json} | 850 ++++++++++++------ ...73ccc6a6-e10d-4619-914f-26032cddf8da.json} | 850 ++++++++++++------ ...20c5af59-ff73-4731-9230-f92bb86e657b.json} | 848 +++++++++++------ ...fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json} | 848 +++++++++++------ ...d30617fc-8d64-4070-b86a-c982025cfcea.json} | 848 +++++++++++------ ...aa8cae95-cb75-4241-951c-25e2046042dd.json} | 848 +++++++++++------ ...c88e4a03-22ae-4338-bf5f-36070814136a.json} | 850 ++++++++++++------ ...4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json} | 850 ++++++++++++------ ...ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json} | 850 ++++++++++++------ ...097a8da1-f411-4359-8440-2ab06f4ae76c.json} | 850 ++++++++++++------ ...68130abd-1df5-4cd3-919a-2863e9f013c7.json} | 850 ++++++++++++------ ...5d8d795a-d213-4b96-9b17-ad5fae6b3687.json} | 850 ++++++++++++------ ...7908da03-f030-4c62-a121-c04bd94ea75e.json} | 848 +++++++++++------ ...c6fdbf96-2500-4410-8fcd-268ea3e16062.json} | 848 +++++++++++------ ...537164c3-7b88-4543-b19d-370f55a25a66.json} | 848 +++++++++++------ ...0c539e26-8403-42db-acfc-7953dd80ae20.json} | 848 +++++++++++------ ...364c7490-8bb1-4e7e-b485-fb3c2224da58.json} | 850 ++++++++++++------ ...1a9167d2-882c-4582-b4e0-ac425896a317.json} | 848 +++++++++++------ ...8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json} | 850 ++++++++++++------ ...d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json} | 850 ++++++++++++------ ...a94c9e13-dca7-4e02-a795-09d9274354d3.json} | 850 ++++++++++++------ ...75c8b20f-a4d4-4699-be79-f027bf7f0d69.json} | 850 ++++++++++++------ ...264be7b4-08b7-40b6-a5e7-f3536f361450.json} | 850 ++++++++++++------ ...83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json} | 850 ++++++++++++------ ...8a013eb3-0f21-4a50-8a53-4ba977951130.json} | 850 ++++++++++++------ ...7b081a40-7cb6-4405-b842-3db95f290dfa.json} | 850 ++++++++++++------ ...54185b53-9891-43c6-8f93-09ff02b728d8.json} | 850 ++++++++++++------ ...884c194d-6519-4bd4-8add-6514e593c514.json} | 850 ++++++++++++------ ...a80cbd76-bcf8-4174-b0b3-346fae152bdb.json} | 850 ++++++++++++------ ...5f105986-aa7d-4858-91bc-cece9d0085ba.json} | 850 ++++++++++++------ ...528b7b4e-c8a6-4387-bd98-497a3316029d.json} | 850 ++++++++++++------ ...96eb34db-66bd-4945-8b4c-a8c1394fe56a.json} | 850 ++++++++++++------ ...961e917b-0e67-462c-b9d0-0fe4b4b85beb.json} | 850 ++++++++++++------ ...59a85d2c-16ce-4ed4-bc65-f6898127fa57.json} | 850 ++++++++++++------ ...16a8b446-51fc-4c23-9231-46ee16c1c0a8.json} | 850 ++++++++++++------ ...f4de7e58-7060-440b-8f6f-1f79d7499d1e.json} | 850 ++++++++++++------ ...5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json} | 850 ++++++++++++------ ...dc6aa933-67e4-4811-b3e2-e5200c002abe.json} | 850 ++++++++++++------ ...5f9758a3-fd6d-4598-930a-9c01420d05e8.json} | 850 ++++++++++++------ ...7592c0d8-a06c-4189-81a1-dbf794d22c8b.json} | 850 ++++++++++++------ ...83c0e8e3-087c-4d61-9153-e571b4971871.json} | 850 ++++++++++++------ ...c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json} | 850 ++++++++++++------ ...5baac093-babb-41cd-a2f4-985d0b91be37.json} | 848 +++++++++++------ ...1bf54088-ba12-45b4-8f80-63d5c38f58f6.json} | 850 ++++++++++++------ ...5ed0a970-200f-4f23-9623-e714afa49ddf.json} | 850 ++++++++++++------ ...e7fd06a6-65e5-4f88-8e86-c513f78e31db.json} | 850 ++++++++++++------ ...ac047aef-008f-4c87-a6d5-4f331ebf5c53.json} | 850 ++++++++++++------ ...ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json} | 850 ++++++++++++------ ...7517b6c9-c613-416c-aadb-39fd6d252da7.json} | 850 ++++++++++++------ ...85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json} | 850 ++++++++++++------ ...df568c3c-8a5c-4455-836d-c980d7f5ea5c.json} | 850 ++++++++++++------ ...96e24977-ca6d-402c-bfd8-62be4cd9b902.json} | 850 ++++++++++++------ ...e5b2636a-8438-40c0-9f89-9f35585bf740.json} | 850 ++++++++++++------ ...f3259d92-3c95-4b78-81ae-f7f4b80aec63.json} | 850 ++++++++++++------ ...5ba23a34-4232-487f-b3e9-326d776135be.json} | 850 ++++++++++++------ ...5bc1a462-f753-4259-91c3-a549491b2986.json} | 850 ++++++++++++------ ...16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json} | 850 ++++++++++++------ ...dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json} | 850 ++++++++++++------ ...2ca11d4c-52e6-49ea-a5cb-238c0313c483.json} | 850 ++++++++++++------ ...de400624-6c2e-47af-b851-54c4075c30ee.json} | 850 ++++++++++++------ ...34441b3b-4d66-444c-af85-ca0666a48ed4.json} | 850 ++++++++++++------ ...eecf5e40-9110-47ea-a72b-9ba587b96e30.json} | 850 ++++++++++++------ ...f26fb123-c214-4d18-aea8-b05b4ea1819b.json} | 850 ++++++++++++------ ...30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json} | 850 ++++++++++++------ ...b152cd5c-cbc0-48f4-ba37-16878c3afba1.json} | 850 ++++++++++++------ ...dac223e9-3073-46f9-924b-c5a6408f5da9.json} | 850 ++++++++++++------ ...a7a218ff-7afe-417c-ac39-cf305d592d56.json} | 850 ++++++++++++------ ...2e165735-43b8-4317-9cde-35aa4b5bcb26.json} | 850 ++++++++++++------ ...15c25bc5-7b1e-4771-bda2-fd04d74e1463.json} | 850 ++++++++++++------ ...26036c7c-e981-46e8-b5e9-dcd7d116af70.json} | 848 +++++++++++------ ...b3269e4e-98a7-4795-8ef3-fc87774a54b7.json} | 848 +++++++++++------ ...284fde9f-8570-4e6d-9190-e52d8723fe57.json} | 848 +++++++++++------ ...fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json} | 850 ++++++++++++------ scripts/HELM/parse_helm_leaderboards.sh | 9 + utils/helm/adapter.py | 56 +- 311 files changed, 75832 insertions(+), 28221 deletions(-) rename data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/{18881f8b-b06e-4317-b697-6eadb975077c.json => bd982107-7c03-4ee8-8a38-782d68883818.json} (80%) rename data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/{97db1a8d-b7d8-4481-82fb-dc0c6396edac.json => 25aa6e41-ab16-4f63-9613-bfb83b9151c5.json} (80%) rename data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/{8d29f447-01d8-4fae-87d5-b4386ce5239a.json => ddd52881-1248-4652-9f1d-5d2b58ede889.json} (80%) rename data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/{53090373-ea82-4b63-83fd-f1d48f0400cd.json => 365bc693-73b6-41fe-a8fa-eba7b91febe0.json} (80%) rename data/helm_capabilities/amazon/{nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json => nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json} (80%) rename data/helm_capabilities/amazon/{nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json => nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json} (80%) rename data/helm_capabilities/amazon/{nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json => nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json} (81%) rename data/helm_capabilities/amazon/{nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json => nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json} (80%) rename data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/{568969ac-4b9a-42b0-8374-2b28dde30a3c.json => f350d9d1-b743-4017-bc68-a4dc726515d0.json} (80%) rename data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/{c6b92f00-6335-463d-87db-817ff85f36c8.json => c32a1f0a-bf8a-42be-b155-4f87465235bc.json} (80%) rename data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/{460fdbd2-a164-4af4-95ff-db66e381ca0c.json => 96cfde1b-77de-4d2a-8b45-938116795108.json} (80%) create mode 100644 data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json rename data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/{cb21169b-04ff-47d1-92dd-5b5f2e09b863.json => d633fcd6-eb01-49ff-ba7c-6ca12734746f.json} (80%) rename data/helm_capabilities/anthropic/claude-opus-4-20250514/{2168d830-ad6b-4aee-94f0-7ec8fd403a49.json => 7a7b49ff-5060-4d12-acb9-607125fbe081.json} (80%) rename data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/{a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json => 287a3646-d969-4bd9-9773-86463c1ba87f.json} (80%) rename data/helm_capabilities/anthropic/claude-sonnet-4-20250514/{629d5de7-25ed-4088-aca6-7fb53719f4a4.json => 97f3892f-9588-49ef-abef-3a0c965bb352.json} (80%) create mode 100644 data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json rename data/helm_capabilities/deepseek-ai/deepseek-r1-0528/{fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json => 9e5684dc-6380-4353-b966-7205d66340fa.json} (81%) rename data/helm_capabilities/deepseek-ai/deepseek-v3/{d031935b-2b54-4940-a852-dad1f10fc396.json => 1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json} (81%) rename data/helm_capabilities/google/gemini-1.5-flash-002/{b79010aa-d441-4850-b656-52ce6587dab8.json => 20512a3b-ac0f-483a-8bec-9962980c579c.json} (80%) rename data/helm_capabilities/google/gemini-1.5-pro-002/{dde5a36d-f14b-482d-86db-74bdb3771e38.json => 704c5c74-a0ee-457d-9b4e-3ae895ffc105.json} (80%) rename data/helm_capabilities/google/gemini-2.0-flash-001/{981ba423-a1d2-4577-9f61-9c4b8b430b58.json => eb9224b8-0edb-4605-a2ee-cfb63f41370e.json} (81%) rename data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/{56ddcce9-fc1c-476f-96c8-65a7d732c95b.json => 4cb58f80-c2b1-45c6-b781-19af47660eb0.json} (80%) rename data/helm_capabilities/google/gemini-2.5-flash-lite/{22da4909-8b3b-49f3-940f-8764509725f8.json => 6307e0c4-c983-4257-82d8-b2a50171eb8a.json} (81%) rename data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/{a6b3d596-d204-4cb7-a3e4-4e717537b76a.json => 275cd615-bddf-4afe-a499-b463fe183486.json} (80%) rename data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/{eaa18be0-1195-4344-9673-efa8c555456d.json => 03b48360-a387-44ba-94b2-2eb7c234a9fa.json} (80%) create mode 100644 data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json rename data/helm_capabilities/ibm/granite-3.3-8b-instruct/{0ae30d3c-098c-434f-985b-58e8179148a6.json => 5e5720d0-67fe-40a9-b65b-d4154848d83c.json} (81%) create mode 100644 data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json create mode 100644 data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json rename data/helm_capabilities/marin-community/marin-8b-instruct/{cc90bae5-b964-4402-9edb-5427663f01fb.json => aba1fded-b031-48df-87ef-dc744df33501.json} (80%) rename data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/{2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json => 98f69aa6-b227-4076-a76e-1293cbe1c6cb.json} (80%) rename data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/{9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json => d2bb087e-a275-4fce-b6dc-001fd4545883.json} (80%) rename data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/{930db2c4-d9c5-4e38-ae80-7304c2f10611.json => 84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json} (80%) rename data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/{226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json => 23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json} (80%) rename data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/{bb4e408d-505e-46c8-bd0c-7afa44a96498.json => 9cab3a77-4f32-48d0-ba11-e2323ccc4861.json} (80%) rename data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/{d63dad7a-f7b7-4c87-9712-3043fc117545.json => 9e037c92-1253-49be-b31a-3aa017531d77.json} (80%) rename data/helm_capabilities/mistralai/mistral-large-2411/{7e7f739e-9363-4c41-871d-6cf6c4145728.json => bd26c7cb-ce76-4b17-b617-d1d93a168c93.json} (81%) rename data/helm_capabilities/mistralai/mistral-small-2503/{853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json => 9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json} (80%) rename data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/{b05befca-44a5-45fb-823e-84bcc3ae81d0.json => d69a1cbe-353c-4be9-b93b-5224d24c7adf.json} (80%) rename data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/{2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json => 915cb39d-f21f-4ef1-a95f-f44f79ede893.json} (80%) rename data/helm_capabilities/moonshotai/kimi-k2-instruct/{eaeab0d7-4418-4699-9774-bc1c6711b3d3.json => fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json} (81%) rename data/helm_capabilities/openai/gpt-4.1-2025-04-14/{c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json => eb51f418-6abf-4b2c-9f57-0b830c00bd15.json} (81%) rename data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/{acaf03fd-9d4b-4fe3-8ffe-88212a786363.json => 41cd14b0-46ba-49da-844a-19fe866bef1e.json} (80%) rename data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/{308d3e1d-a1b9-4722-8333-23b840316e3d.json => 7de93642-a4bc-430b-8733-9befeb6a0e23.json} (80%) rename data/helm_capabilities/openai/gpt-4o-2024-11-20/{84a942b6-2b77-4bc2-859f-6b8d6be93558.json => 4f18292a-1fef-4feb-9b17-045c96e3e137.json} (81%) rename data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/{7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json => 7458c032-b24d-4f13-a659-b6e19d19a8e1.json} (80%) rename data/helm_capabilities/openai/gpt-5-2025-08-07/{cb444c37-e273-4aaf-881e-8a433f630053.json => 21eb1648-aad0-4297-9edc-c445e4c38694.json} (81%) rename data/helm_capabilities/openai/gpt-5-mini-2025-08-07/{7af059e2-b56e-46ed-b699-63e570081f16.json => 99d657ae-e850-4caf-a599-13f1b8072273.json} (81%) rename data/helm_capabilities/openai/gpt-5-nano-2025-08-07/{2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json => 10cd766e-442c-4b3d-833b-740417d9a6d9.json} (80%) create mode 100644 data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json rename data/helm_capabilities/openai/gpt-oss-120b/{e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json => 06719cd4-5654-49b6-9dee-e112d1601d1c.json} (80%) rename data/helm_capabilities/openai/gpt-oss-20b/{acb07214-c0f3-4006-8a3b-23793891a1bf.json => ed849999-48c2-4569-8bcd-dc73084e3197.json} (80%) rename data/helm_capabilities/openai/o3-2025-04-16/{a1c5d581-be98-4e1e-ba14-ca922bfac035.json => 01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json} (80%) rename data/helm_capabilities/openai/o4-mini-2025-04-16/{c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json => 32382d69-21c7-43a9-bb95-27607ec18cc9.json} (80%) rename data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/{f6d74c93-0e96-4fc5-987c-18a79dbde17c.json => 77e702f7-37ef-4487-b047-74b13ef6d966.json} (80%) rename data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/{f96da103-5350-4b1b-b33e-6ced1f1f7815.json => 4ee3c647-740c-41a6-ac66-4a38b09317ff.json} (80%) rename data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/{27bae7f2-92dd-4feb-9050-2d11c6da2d61.json => ca30726a-00a6-4228-94fe-5dce00de1d5e.json} (81%) rename data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/{0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json => 7862890a-298b-4bda-b8f1-7be6a5779365.json} (81%) create mode 100644 data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json rename data/helm_capabilities/writer/palmyra-fin/{39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json => 442aed0d-95c3-4436-ad63-b7b1e93307f4.json} (80%) rename data/helm_capabilities/writer/palmyra-med/{1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json => 7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json} (80%) rename data/helm_capabilities/writer/palmyra-x-004/{01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json => bc2c91e0-6afd-4e44-b665-d5c7558f8981.json} (80%) rename data/helm_capabilities/writer/palmyra-x5/{c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json => a74b74f7-ccce-4341-a122-26728cc6bece.json} (80%) rename data/helm_capabilities/xai/grok-3-beta/{24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json => 87811b75-afe8-413b-949d-7fd1f582a2e8.json} (80%) rename data/helm_capabilities/xai/grok-3-mini-beta/{b028eaaf-bc4d-4918-8464-f8c4b0c74973.json => ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json} (80%) rename data/helm_capabilities/xai/grok-4-0709/{c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json => 924080a0-c530-4e6d-b1a4-107de3bd7183.json} (80%) rename data/helm_capabilities/zai-org/glm-4.5-air-fp8/{7b231b0d-89b8-4a0a-825e-ccfea212f565.json => be23c720-a99a-4945-bc0b-ddc27c8eec39.json} (81%) rename data/helm_classic/{anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json => Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json} (92%) rename data/helm_classic/ai21/J1-Grande-v1-17B/{09f5c502-2950-48fb-b25f-b562eeee26c8.json => c12a8494-bafc-4097-874a-7c00636e96f8.json} (92%) rename data/helm_classic/ai21/J1-Grande-v2-beta-17B/{3d13f9ba-b18e-4b52-b28d-9aed0621903d.json => 4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json} (92%) rename data/helm_classic/ai21/J1-Jumbo-v1-178B/{3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json => 19f61327-fcc3-408f-9254-2d6a2aadcd4e.json} (92%) rename data/helm_classic/ai21/J1-Large-v1-7.5B/{1ab7f23a-7527-4188-9141-852f5123eb19.json => ccc17d56-bd26-409c-ac3f-262eaba9ce21.json} (92%) rename data/helm_classic/ai21/Jurassic-2-Grande-17B/{f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json => f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json} (92%) rename data/helm_classic/ai21/Jurassic-2-Jumbo-178B/{ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json => 9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json} (92%) rename data/helm_classic/ai21/Jurassic-2-Large-7.5B/{67114722-a441-478b-a324-2c32be7e06a7.json => f25c142c-8730-4241-a649-01d076e1f28d.json} (91%) rename data/helm_classic/aleph-alpha/Luminous-Base-13B/{07fa437f-398d-48ab-a74d-b8c59caf3add.json => ab34f23e-36db-40c0-9681-f30b00692f98.json} (92%) rename data/helm_classic/aleph-alpha/Luminous-Extended-30B/{7492964a-2c16-4261-aaca-dbcd4f3be7c3.json => 67281534-a03d-49d8-a586-25cb1a03134e.json} (92%) rename data/helm_classic/aleph-alpha/Luminous-Supreme-70B/{b5dace02-416d-4b90-90e1-562b22820784.json => 3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json} (92%) rename data/helm_classic/bigscience/BLOOM-176B/{0e6cd483-dff8-4fba-9239-82cb0fe34d42.json => 04ce2ba4-c382-4658-ba06-1def9499a243.json} (92%) rename data/helm_classic/bigscience/T0pp-11B/{9ae59291-604f-4527-812a-a3150a1098f2.json => 3a546396-d031-4958-8410-00e0d3406089.json} (93%) rename data/helm_classic/cohere/Cohere-Command-beta-52.4B/{52026df3-2452-4fd2-a10b-73a2bfc5397e.json => e7b99aa6-08e8-4224-a805-16586eb44325.json} (92%) rename data/helm_classic/cohere/Cohere-Command-beta-6.1B/{19b97859-5af3-4883-a878-93d026c29d87.json => 43a3fe19-929a-463d-a0ed-791dad765188.json} (92%) rename data/helm_classic/cohere/Cohere-large-v20220720-13.1B/{37af5185-3599-49f5-9637-55d41bc6ae81.json => 75468958-b75b-41fe-9813-070b793e86d9.json} (92%) rename data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/{cf32b49f-7cf8-43a3-9e28-ade7446272ab.json => 6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json} (92%) rename data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/{ad9bd354-01d9-4a21-a299-a53190e1eb7e.json => 3c9c425a-ce4a-4958-9744-7f9490ed5729.json} (92%) rename data/helm_classic/cohere/Cohere-small-v20220720-410M/{12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json => 5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json} (92%) rename data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/{ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json => 8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json} (92%) rename data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/{d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json => f8044c74-3f1c-4562-a21c-e448061b2077.json} (92%) rename data/helm_classic/{eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json => eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json} (91%) rename data/helm_classic/{eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json => eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json} (91%) rename data/helm_classic/{writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json => google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json} (91%) rename data/helm_classic/google/T5-11B/{df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json => 52db5c6d-b54e-401a-880d-8ab41a394bc4.json} (92%) rename data/helm_classic/google/UL2-20B/{ac49ac68-0d7f-4972-bb99-0332b14df2d5.json => 68becad6-9455-4d3d-8d68-d1b4448598a1.json} (92%) rename data/helm_classic/lmsys/Vicuna-v1.3-13B/{39f4648c-6635-4ffa-86f5-040e69f3e054.json => 519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json} (91%) rename data/helm_classic/lmsys/Vicuna-v1.3-7B/{4ef38a9d-283c-4549-8de3-d04ce7f62542.json => 972bc5db-f536-42f9-aa51-83cc2f59b76a.json} (91%) rename data/helm_classic/meta/LLaMA-13B/{81eee874-47be-4a55-af47-5b3e1bcbd361.json => b2220101-56e0-49d9-a3d1-d3bec769ab97.json} (91%) rename data/helm_classic/meta/LLaMA-30B/{2a23b568-daed-4783-9c51-5218216f5f19.json => 96907b25-05c3-441b-afc4-69274c20bfc3.json} (91%) rename data/helm_classic/meta/LLaMA-65B/{584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json => 66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json} (91%) rename data/helm_classic/meta/LLaMA-7B/{6a2445e0-75d4-4434-aabd-645fd445a920.json => 70e9e156-6807-489b-b77a-367236614826.json} (91%) rename data/helm_classic/meta/Llama-2-13B/{f5d57067-8a00-490f-b1bf-30afd0b0f126.json => e90cfb46-1173-4d22-9329-9bf57cdd5241.json} (91%) rename data/helm_classic/meta/Llama-2-70B/{cb8802af-613e-42a1-b025-31532996eb10.json => baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json} (91%) rename data/helm_classic/meta/Llama-2-7B/{ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json => 7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json} (91%) rename data/helm_classic/meta/OPT-175B/{75a5843f-73a4-4ff3-94b5-184152ff703c.json => ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json} (92%) rename data/helm_classic/meta/OPT-66B/{83d19197-aebd-43fa-a7ed-20818a9e5d8e.json => 26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json} (92%) rename data/helm_classic/microsoft/TNLG-v2-530B/{dd121d07-5198-4ac6-81d6-df38485bff25.json => ecd21c26-cdc4-43b1-b933-4d970df9413a.json} (92%) rename data/helm_classic/microsoft/TNLG-v2-6.7B/{f23680f4-8b5a-4baf-9e8d-74f0f4847183.json => 9d4350eb-cdf0-432f-b3b0-45f4832ca950.json} (92%) rename data/helm_classic/{mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json => mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json} (91%) rename data/helm_classic/mosaicml/MPT-30B/{cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json => b277c87e-54b5-466f-97d7-35db4cd7b985.json} (91%) rename data/helm_classic/mosaicml/MPT-Instruct-30B/{182a7373-7ea3-4f2b-b730-af16e20b9fa7.json => 270df23b-9e58-4259-a8ed-0d25b9c80b2a.json} (91%) rename data/helm_classic/{eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json => openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json} (92%) rename data/helm_classic/{eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json => openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json} (92%) rename data/helm_classic/openai/ada-350M/{f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json => e6ea5f7e-0533-4a99-8638-1cc10c454238.json} (94%) rename data/helm_classic/openai/babbage-1.3B/{1c4a54f3-4599-441b-8f30-5e275a0597a7.json => 83c924fe-6318-4bad-adb0-8a81e5e28ee0.json} (94%) rename data/helm_classic/openai/curie-6.7B/{dbefbdbd-b64e-40e9-b632-0dcae3f33913.json => 82e2c0e3-66f2-431f-b4b8-d2495970d998.json} (94%) rename data/helm_classic/openai/davinci-175B/{f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json => 6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json} (94%) rename data/helm_classic/openai/gpt-3.5-turbo-0301/{2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json => e18fbf9e-677c-49fb-ab76-475e8f605f01.json} (91%) rename data/helm_classic/openai/gpt-3.5-turbo-0613/{826d8e72-7332-48b1-af41-537e505c9e11.json => 039af363-0c5c-4e36-8396-cd57c7e4c1de.json} (91%) rename data/helm_classic/openai/text-ada-001/{c34ec087-f3a1-49f1-8ff7-79f353171c4c.json => 8ea1facb-260a-461d-9271-2c07b318c46f.json} (94%) rename data/helm_classic/openai/text-babbage-001/{09763c40-c365-4be9-befc-970ce1886641.json => 93007ac9-04c2-451d-abd2-2f235297747e.json} (94%) rename data/helm_classic/openai/text-curie-001/{4ece7c38-114a-4973-ba13-ac3821c9836f.json => b04e5f90-e46e-4d7a-a6a9-569bde072208.json} (94%) rename data/helm_classic/openai/text-davinci-002/{75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json => 933dc76f-45f0-48e0-93ae-3e19cff87c2a.json} (94%) rename data/helm_classic/openai/text-davinci-003/{0c43aeaf-c7d3-4e00-8b84-5115a6396585.json => b8408a64-eb89-4337-8ee5-3c48e4e24437.json} (94%) rename data/helm_classic/stanford/Alpaca-7B/{d25691b8-37e7-42ff-b59a-8684197280f1.json => d5846321-0800-4ff9-b85c-53c8b4884ba5.json} (91%) rename data/helm_classic/{tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json => tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json} (91%) rename data/helm_classic/{tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json => tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json} (91%) rename data/helm_classic/{tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json => tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json} (91%) rename data/helm_classic/{tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json => tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json} (91%) rename data/helm_classic/together/RedPajama-INCITE-Base-7B/{8db87a70-babc-4776-8317-70752d3c5546.json => 3a329574-dcf6-4177-b37c-c495e6af6cc5.json} (91%) rename data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/{3da308fb-2403-432e-bde3-3b14af627552.json => 9e662c1e-e77c-4fb3-b589-127683a4b2ca.json} (91%) rename data/helm_classic/together/RedPajama-INCITE-Instruct-7B/{fd8f7b08-813c-4369-bfe4-d86eacc874ea.json => 375140f6-bd3f-4b55-a35c-23de37254296.json} (91%) rename data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/{e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json => 021d0b25-8f58-47da-a58c-ac532a7972bf.json} (91%) rename data/helm_classic/writer/InstructPalmyra-30B/{bcf54365-b229-4abf-8ff8-59b4b46fa829.json => 9207fec4-d0c4-4f66-b917-f5ed57409215.json} (91%) rename data/helm_classic/yandex/YaLM-100B/{eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json => b04c8845-cccf-4856-9597-ab283bb2ec8d.json} (91%) rename data/helm_classic/zhipu-ai/GLM-130B/{f45719e5-3334-4e1d-8a83-f5f8292cb977.json => 4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json} (91%) rename data/helm_instruct/anthropic/claude-v1.3/{c4e55239-581b-433f-82bc-68a690f59e4a.json => 0e30e895-aaf7-42d4-95db-7541d6b41c87.json} (61%) rename data/helm_instruct/cohere/command-xlarge-beta/{8a68cccf-2965-4867-b922-460cc5b695de.json => 4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json} (61%) rename data/helm_instruct/openai/gpt-3.5-turbo-0613/{a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json => 8befd29c-a16d-4e05-a92f-00b621d45e03.json} (61%) rename data/helm_instruct/openai/gpt-4-0314/{d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json => b2e193b8-215b-4e80-9d5a-df11f1dac88a.json} (61%) rename data/helm_lite/01-ai/yi-34b/{3b8567cf-40f0-4d63-ad12-9b1712a2c503.json => eedd0f38-6d26-4297-a469-291227ec6be6.json} (82%) rename data/helm_lite/01-ai/yi-6b/{3b94c757-b54d-462c-a2a1-d331711a0833.json => 74c47665-740f-4784-8a27-1c1d1c29bff8.json} (82%) rename data/helm_lite/01-ai/yi-large-preview/{3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json => 8027b577-7f48-4df5-9879-bd45ac342f42.json} (82%) rename data/helm_lite/AlephAlpha/luminous-base/{b4fa23d2-48cd-4a58-b70d-25b466781008.json => e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json} (82%) rename data/helm_lite/AlephAlpha/luminous-extended/{818cfaa1-815b-4a13-b017-5e6c30ed9de3.json => 24e11e7b-15d6-4a09-9545-38486d0eb236.json} (82%) rename data/helm_lite/AlephAlpha/luminous-supreme/{62727554-ab2c-4218-9c3c-3eba48420834.json => eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json} (82%) rename data/helm_lite/ai21/j2-grande/{c58c4299-ede8-46b6-8d33-2f900c272853.json => 52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json} (82%) rename data/helm_lite/ai21/j2-jumbo/{bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json => 68713712-ae92-474b-84c0-1b8301538439.json} (82%) rename data/helm_lite/ai21/jamba-1.5-large/{38918b97-2707-4b53-99a8-7a67816f398c.json => 15cc9411-6ea4-4f10-831f-23ff27fd5704.json} (82%) rename data/helm_lite/ai21/jamba-1.5-mini/{82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json => 3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json} (82%) rename data/helm_lite/ai21/jamba-instruct/{9278a23a-cecd-446c-b234-2301e1e44c40.json => 1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json} (82%) rename data/helm_lite/allenai/olmo-7b/{81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json => 078d812b-2198-4497-8fbe-06fb640fd86d.json} (82%) rename data/helm_lite/amazon/{nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json => nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json} (82%) rename data/helm_lite/amazon/{nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json => nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json} (82%) rename data/helm_lite/amazon/{nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json => nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json} (82%) rename data/helm_lite/anthropic/claude-2.0/{b2b9e87c-76de-4716-8d28-4b13a34c360f.json => 0684c1d2-ea43-4341-820c-09051f5e11f2.json} (82%) rename data/helm_lite/anthropic/claude-2.1/{0bd11df6-a037-4f55-a78a-cc23c34c0958.json => 51821ca1-7eac-4094-abac-98b2484cc5a0.json} (82%) rename data/helm_lite/anthropic/claude-3-5-haiku-20241022/{f4061c6a-f82f-4642-a734-f6adb0be7519.json => 8a0f5749-7f6a-4813-9c08-7283433c1337.json} (82%) rename data/helm_lite/anthropic/claude-3-5-sonnet-20240620/{18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json => 4697983d-a29a-484d-9268-7974117456e8.json} (82%) rename data/helm_lite/anthropic/claude-3-5-sonnet-20241022/{d0cd5626-5b2c-46df-b265-e130a789a0e7.json => 60e33aa3-0593-42e6-9baa-8311746deca0.json} (82%) rename data/helm_lite/anthropic/claude-3-haiku-20240307/{3eea5b0f-1126-448f-94e5-52a874baa61a.json => 2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json} (82%) rename data/helm_lite/anthropic/claude-3-opus-20240229/{9fa44303-4699-47f2-9777-0c118e36d87e.json => 9ad91ee2-7a64-4f94-9166-f2681777023b.json} (82%) rename data/helm_lite/anthropic/claude-3-sonnet-20240229/{a2d019d6-52bf-439f-90f0-74583928e5c0.json => 4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json} (82%) rename data/helm_lite/anthropic/claude-instant-1.2/{0f884c98-ea5e-4409-81e2-40aa5c84f99d.json => 64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json} (82%) rename data/helm_lite/anthropic/claude-v1.3/{2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json => fe8a36b0-4361-461b-b310-656c54131fa6.json} (82%) rename data/helm_lite/cohere/command-light/{8c312031-5da7-4816-8207-056fe1bc161d.json => b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json} (82%) rename data/helm_lite/cohere/command-r-plus/{71c0558f-7b56-40ea-a1be-2749b88758c7.json => 67967a2a-5fb4-46e8-b1ec-eda1588d9086.json} (82%) rename data/helm_lite/cohere/command-r/{d1330068-2c16-450e-8ce5-1d05f5e842d9.json => 0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json} (82%) rename data/helm_lite/cohere/command/{dec04718-1ae9-4e4b-92da-01d789424f69.json => ba5eea81-2120-4a20-8322-dfbd29cd197c.json} (82%) rename data/helm_lite/databricks/dbrx-instruct/{ba50499a-6cfd-4f04-aab5-c2122202cc74.json => 9dd66ede-da5c-4627-92ed-7057c9a2bea3.json} (82%) rename data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/{35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json => 801aa7da-90b2-48d1-ad3d-943b06bd437c.json} (82%) rename data/helm_lite/deepseek-ai/deepseek-v3/{d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json => a58923ea-fa22-4c45-8327-efbe84c8a05d.json} (82%) rename data/helm_lite/google/gemini-1.0-pro-002/{1e98157d-49e6-4d66-ae21-a95d419c47e3.json => bab8d241-fad0-4230-b213-c2eeccc79f12.json} (82%) rename data/helm_lite/google/gemini-1.5-flash-001/{e92bce18-690a-44eb-8bc5-28e9303473bb.json => 65e37589-ef26-46cd-a627-798af70e75bf.json} (82%) rename data/helm_lite/google/gemini-1.5-flash-002/{3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json => f499f9c6-4c9a-43ba-b4c3-d094494a371c.json} (82%) rename data/helm_lite/google/gemini-1.5-pro-001/{b1ecfc78-f59e-437f-b163-9253ad092799.json => 27a54446-57b2-4239-b768-7ab85dc94c54.json} (82%) rename data/helm_lite/google/gemini-1.5-pro-002/{04415dda-306f-420c-8af8-54336368fc40.json => 5de8a13e-a029-4a90-9a2d-c28a59212140.json} (82%) rename data/helm_lite/google/gemini-2.0-flash-exp/{ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json => f9643ce2-7347-401b-903e-fadcc5221f36.json} (82%) rename data/helm_lite/google/gemma-2-27b-it/{5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json => 9932e430-2039-40b0-bc8f-ae2d833543e8.json} (82%) rename data/helm_lite/google/gemma-2-9b-it/{63af45df-c46d-46df-8f3e-592181ce6a7a.json => dbd2e9bb-c2ca-4165-b229-d736a70721a5.json} (82%) rename data/helm_lite/google/gemma-7b/{aad88f1f-6047-45e7-8b0f-d5deac20be68.json => 32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json} (82%) rename data/helm_lite/google/text-bison@001/{f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json => 70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json} (82%) rename data/helm_lite/google/text-unicorn@001/{35f70e20-8a08-4f7c-b822-5238337d4177.json => 07a367ee-2879-4ede-bbf8-33b24d682467.json} (82%) rename data/helm_lite/meta/llama-2-13b/{e19c56fc-5f6c-48a0-874a-97665283e6f0.json => fee914c7-d6bf-4d61-9f50-71bae5f11006.json} (82%) rename data/helm_lite/meta/llama-2-70b/{98a0c9bb-9679-4cc5-85b8-8801dbb965de.json => b0577066-231e-461b-bae8-b724b204397a.json} (82%) rename data/helm_lite/meta/llama-2-7b/{fad21bfe-048f-412c-b3fd-9b43d276b2a2.json => b79fe2e3-5eec-46f8-90a1-810781c8c46a.json} (82%) rename data/helm_lite/meta/llama-3-70b/{b1e28406-d88d-4acd-a268-7baebc9b565a.json => 998616ef-5d1b-4c65-b6ad-23afc3630d5a.json} (82%) rename data/helm_lite/meta/llama-3-8b/{60696eaf-669d-49bf-bebe-6cd171522faa.json => fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json} (82%) rename data/helm_lite/meta/llama-3.1-405b-instruct-turbo/{ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json => 25fde5e6-86b8-4a80-8f79-5946ef9999fc.json} (82%) rename data/helm_lite/meta/llama-3.1-70b-instruct-turbo/{c3b72d96-9af5-4e32-b420-e85a88e82e5a.json => b955825d-ae7f-48c4-9dad-5ee78879737d.json} (82%) rename data/helm_lite/meta/llama-3.1-8b-instruct-turbo/{57b2177d-0232-41ca-aa3a-b2ecb7af7586.json => 168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json} (82%) rename data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/{6ed32ce2-18e5-4d1b-94f8-443f81892275.json => 0807e353-9787-4ca0-8f7b-50d1bed2469e.json} (82%) rename data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/{5c11f938-7933-45ae-8530-05dac1012f10.json => 0164b885-2c27-4eba-8e6f-e69156cb0dee.json} (82%) rename data/helm_lite/meta/llama-3.3-70b-instruct-turbo/{2b9e00e5-15e1-45ea-a345-32a3d84460fb.json => 08422837-51a0-45c9-9004-fc5d98dce462.json} (82%) rename data/helm_lite/meta/llama-65b/{3e27a5c3-a752-4790-b219-5964331e40ac.json => 39f2c7f2-56d4-4349-95ae-374d34263f48.json} (82%) rename data/helm_lite/microsoft/phi-2/{061081c1-6044-40ec-b4a7-1668b8f3ba4f.json => 0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json} (82%) rename data/helm_lite/microsoft/phi-3-medium-4k-instruct/{33df0ce7-048b-4a1b-816c-a6221afe41de.json => 75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json} (82%) rename data/helm_lite/microsoft/phi-3-small-8k-instruct/{a3f47cc2-0563-4285-b777-0fcc3c642249.json => 2de4b89a-3f3b-4d1d-ba85-030953a46956.json} (82%) rename data/helm_lite/mistralai/mistral-7b-instruct-v0.3/{067ef4d7-387c-4c09-a1c4-a10af69811f0.json => bd68405f-fe9a-448b-9c80-468c656594e5.json} (82%) rename data/helm_lite/mistralai/mistral-7b-v0.1/{0a07f39c-745a-46c3-ad11-c79a50cc18bb.json => 4267fef1-3180-46e3-990e-0d1092ec4c18.json} (82%) rename data/helm_lite/mistralai/mistral-large-2402/{35797854-d46a-4646-94a2-3acf1d484418.json => 002a34dc-39e5-451d-b2a8-b51bdb69a056.json} (82%) rename data/helm_lite/mistralai/mistral-large-2407/{3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json => 5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json} (82%) rename data/helm_lite/mistralai/mistral-medium-2312/{33bd2b4e-0292-47b7-84de-de6ff5804257.json => ad2beded-cec3-4b47-b8de-a32a3225fa66.json} (82%) rename data/helm_lite/mistralai/mistral-small-2402/{67edb54d-efed-4a23-97ef-6d2a9f254ae1.json => eb901347-fc1f-4d8f-a70a-05a83e16658d.json} (82%) rename data/helm_lite/mistralai/mixtral-8x22b/{ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json => 9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json} (82%) rename data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/{469d069f-581e-415c-9c9d-f57e7c972da5.json => 042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json} (82%) rename data/helm_lite/mistralai/open-mistral-nemo-2407/{c9a3f927-041f-47cf-ae02-03fe4be0a59e.json => d2d48e4a-0484-4f44-8108-2e689d7ca695.json} (82%) rename data/helm_lite/openai/gpt-3.5-turbo-0613/{1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json => e54ae605-a91d-47d7-a08d-67bd0ea5c606.json} (82%) rename data/helm_lite/openai/gpt-4-0613/{4e58fdd9-e14c-441a-a9fb-4c525a615880.json => 15dccf75-871d-457b-8495-e0d03d550360.json} (82%) rename data/helm_lite/openai/gpt-4-1106-preview/{252ec309-9b98-463e-aee4-6e28deb6dcfb.json => 18fe5d30-bf36-405a-819e-1ecabda327ea.json} (82%) rename data/helm_lite/openai/gpt-4-turbo-2024-04-09/{5530c426-2321-4aa3-b860-f9b764b7b748.json => cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json} (82%) rename data/helm_lite/openai/gpt-4o-2024-05-13/{da92cfe0-b066-416a-9408-3eb9d36b9fb3.json => cd199905-04a4-4745-b848-4f7bde97ca17.json} (82%) rename data/helm_lite/openai/gpt-4o-2024-08-06/{2a752701-a826-4316-b3eb-e9eec90a5a89.json => 1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json} (82%) rename data/helm_lite/openai/gpt-4o-mini-2024-07-18/{bea4af4b-8155-4784-9192-b40270d574af.json => bfd70aff-bf45-4f55-b730-4924afc181cd.json} (82%) rename data/helm_lite/openai/text-davinci-002/{d08eccd1-602c-4d64-a487-2d9c028459a0.json => b6e08679-1bd7-42a1-9eee-98252de2c7c1.json} (82%) rename data/helm_lite/openai/text-davinci-003/{3cceb22d-7ce9-49a1-a677-548a97c52970.json => 22b411d5-a314-4b17-a9c7-c1af7ca7df33.json} (82%) rename data/helm_lite/qwen/qwen1.5-110b-chat/{6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json => f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json} (82%) rename data/helm_lite/qwen/qwen1.5-14b/{9b1ee735-bc25-48fd-94cd-24f17edcdc21.json => fb1bb023-16f6-4914-889b-6458d7ab1277.json} (82%) rename data/helm_lite/qwen/qwen1.5-32b/{a648cb90-bcce-4171-a664-df0b19304833.json => 8b572c10-3553-4e51-a321-bdb05996914b.json} (82%) rename data/helm_lite/qwen/qwen1.5-72b/{5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json => 6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json} (82%) rename data/helm_lite/qwen/qwen1.5-7b/{71d69629-11b9-4f06-98ca-536f1ab22f2c.json => e0efe169-d28e-418e-a78c-9b04ec29aae2.json} (82%) rename data/helm_lite/qwen/qwen2-72b-instruct/{a594b434-eeb2-41f5-b23d-eea23ed2add2.json => 05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json} (82%) rename data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/{e6a833e5-6b86-4d32-be03-010fdfde3ffc.json => 983696ae-d7f3-48a4-b7a0-a42487728182.json} (82%) rename data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/{cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json => a969e516-adef-4839-9252-244c58ab3c67.json} (82%) rename data/helm_lite/snowflake/snowflake-arctic-instruct/{2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json => f122f9de-b1ce-40ea-8731-6c00c7af0498.json} (82%) rename data/helm_lite/tiiuae/falcon-40b/{346c2a85-3daf-41e9-9305-78851dcf05ae.json => 5c7982c5-3513-4ff2-9857-33a0db825376.json} (82%) rename data/helm_lite/tiiuae/falcon-7b/{69e02d7b-d536-4ff4-a58e-b880ff87f357.json => 4910859a-750c-4728-bf30-309e0e81690e.json} (82%) rename data/helm_lite/upstage/solar-pro-241126/{3286a69f-cdba-49a5-939a-e14ad759e7a4.json => 32f0532f-b504-492d-84d7-f541930edad0.json} (82%) rename data/helm_lite/writer/palmyra-x-004/{b798adc1-01f0-46c5-95a4-8b67199d624b.json => 04c187a3-4532-4523-b39d-19314d61c779.json} (82%) rename data/helm_lite/writer/palmyra-x-v2/{7a07a202-aa88-47fc-987d-6d44a57b6985.json => 4440532c-9b49-4c9a-8bf4-f122531c54fa.json} (82%) rename data/helm_lite/writer/palmyra-x-v3/{ac0a7249-11e7-493d-9190-8c1913bb1c42.json => bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json} (82%) rename data/helm_mmlu/01-ai/yi-34b/{73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json => 3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json} (82%) rename data/helm_mmlu/01-ai/yi-6b/{97569bf5-1e12-4baa-80cc-019be1725ebb.json => 6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json} (82%) rename data/helm_mmlu/01-ai/yi-large-preview/{7c4b387f-45be-41cb-8102-cd738e60f99d.json => 3d0b3d68-a853-4989-a35e-83ac6722c2da.json} (82%) rename data/helm_mmlu/ai21/jamba-1.5-large/{027b7bd4-8943-4d2c-9674-15d33792d391.json => ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json} (82%) rename data/helm_mmlu/ai21/jamba-1.5-mini/{e5ed6c70-6874-4671-abb0-25bbd82471b4.json => 517e8027-6edd-482b-86f3-33b6c41a9609.json} (82%) rename data/helm_mmlu/ai21/jamba-instruct/{4e236f80-5d03-4547-b199-b8718439fbed.json => f7c1c125-ad0f-4847-b880-4f705f1666c6.json} (82%) rename data/helm_mmlu/allenai/olmo-1.7-7b/{1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json => 5a0ba280-8a12-4735-9d92-4ed71ba395b4.json} (82%) rename data/helm_mmlu/allenai/olmo-7b/{31666792-6d68-42da-95f8-3b9f8590c7fd.json => 73ccc6a6-e10d-4619-914f-26032cddf8da.json} (82%) rename data/helm_mmlu/amazon/{nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json => nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json} (82%) rename data/helm_mmlu/amazon/{nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json => nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json} (82%) rename data/helm_mmlu/amazon/{nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json => nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json} (82%) rename data/helm_mmlu/anthropic/claude-2.1/{357edc36-d500-4e6e-94a4-6653b769b5d8.json => aa8cae95-cb75-4241-951c-25e2046042dd.json} (82%) rename data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/{67f72a7f-15b7-4a2e-b478-38091cba2189.json => c88e4a03-22ae-4338-bf5f-36070814136a.json} (82%) rename data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/{3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json => 4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json} (82%) rename data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/{f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json => ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json} (82%) rename data/helm_mmlu/anthropic/claude-3-haiku-20240307/{b0218eab-984f-4829-90d6-e7fc6f60c530.json => 097a8da1-f411-4359-8440-2ab06f4ae76c.json} (82%) rename data/helm_mmlu/anthropic/claude-3-opus-20240229/{fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json => 68130abd-1df5-4cd3-919a-2863e9f013c7.json} (82%) rename data/helm_mmlu/anthropic/claude-3-sonnet-20240229/{08d951d1-2912-4a00-99ce-f90340a7fd2a.json => 5d8d795a-d213-4b96-9b17-ad5fae6b3687.json} (82%) rename data/helm_mmlu/anthropic/claude-instant-1.2/{bfff8f1b-24cc-41b8-b11c-85ee48bef059.json => 7908da03-f030-4c62-a121-c04bd94ea75e.json} (82%) rename data/helm_mmlu/cohere/command-r-plus/{f1509273-dea1-477e-bf04-02767838c1f9.json => c6fdbf96-2500-4410-8fcd-268ea3e16062.json} (82%) rename data/helm_mmlu/cohere/command-r/{45524eef-0678-47db-8620-a5f55e166e63.json => 537164c3-7b88-4543-b19d-370f55a25a66.json} (82%) rename data/helm_mmlu/databricks/dbrx-instruct/{cd2371e9-e552-4944-bc30-c2269c960e16.json => 0c539e26-8403-42db-acfc-7953dd80ae20.json} (82%) rename data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/{7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json => 364c7490-8bb1-4e7e-b485-fb3c2224da58.json} (82%) rename data/helm_mmlu/deepseek-ai/deepseek-v3/{87716ef9-56bb-4737-b578-9e53742c714a.json => 1a9167d2-882c-4582-b4e0-ac425896a317.json} (82%) rename data/helm_mmlu/google/gemini-1.0-pro-001/{8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json => 8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json} (82%) rename data/helm_mmlu/google/gemini-1.5-flash-001/{ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json => d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json} (82%) rename data/helm_mmlu/google/gemini-1.5-flash-002/{ec78481a-0b0d-4709-99ea-6423372d6038.json => a94c9e13-dca7-4e02-a795-09d9274354d3.json} (82%) rename data/helm_mmlu/google/gemini-1.5-flash-preview-0514/{2a8845b3-cdbc-409c-8346-f83fb607999a.json => 75c8b20f-a4d4-4699-be79-f027bf7f0d69.json} (82%) rename data/helm_mmlu/google/gemini-1.5-pro-001/{486b6479-f327-43ab-af2c-8824abaf5fe6.json => 264be7b4-08b7-40b6-a5e7-f3536f361450.json} (82%) rename data/helm_mmlu/google/gemini-1.5-pro-002/{4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json => 83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json} (82%) rename data/helm_mmlu/google/gemini-1.5-pro-preview-0409/{bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json => 8a013eb3-0f21-4a50-8a53-4ba977951130.json} (82%) rename data/helm_mmlu/google/gemini-2.0-flash-exp/{0837a2fd-1f25-4133-9ce6-b8ca29830f70.json => 7b081a40-7cb6-4405-b842-3db95f290dfa.json} (82%) rename data/helm_mmlu/google/gemma-2-27b/{b732e4c3-526e-42b3-8003-defe6f99dec5.json => 54185b53-9891-43c6-8f93-09ff02b728d8.json} (82%) rename data/helm_mmlu/google/gemma-2-9b/{72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json => 884c194d-6519-4bd4-8add-6514e593c514.json} (82%) rename data/helm_mmlu/google/gemma-7b/{11b66d50-28d9-42bc-8f91-463b02fa96f7.json => a80cbd76-bcf8-4174-b0b3-346fae152bdb.json} (82%) rename data/helm_mmlu/google/text-bison@001/{70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json => 5f105986-aa7d-4858-91bc-cece9d0085ba.json} (82%) rename data/helm_mmlu/google/text-unicorn@001/{c2e53d3a-b85c-4888-8b20-225db39301ab.json => 528b7b4e-c8a6-4387-bd98-497a3316029d.json} (82%) rename data/helm_mmlu/meta/llama-2-13b/{a477c332-b082-4ad5-8d2f-905690e9d211.json => 96eb34db-66bd-4945-8b4c-a8c1394fe56a.json} (82%) rename data/helm_mmlu/meta/llama-2-70b/{ba574f5e-cc59-4994-a595-e6472c032fc4.json => 961e917b-0e67-462c-b9d0-0fe4b4b85beb.json} (82%) rename data/helm_mmlu/meta/llama-2-7b/{9cfa7f91-bfd0-4f02-988c-1978df8db303.json => 59a85d2c-16ce-4ed4-bc65-f6898127fa57.json} (82%) rename data/helm_mmlu/meta/llama-3-70b/{607a4b9b-3442-4690-b116-a927c6822fb3.json => 16a8b446-51fc-4c23-9231-46ee16c1c0a8.json} (82%) rename data/helm_mmlu/meta/llama-3-8b/{44decfe6-57ed-4677-a859-4fe5ae25b237.json => f4de7e58-7060-440b-8f6f-1f79d7499d1e.json} (82%) rename data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/{af78c3b5-5d91-431d-85ac-783b5a324723.json => 5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json} (82%) rename data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/{1224cee0-22f8-41b0-a7da-8a6100001a3e.json => dc6aa933-67e4-4811-b3e2-e5200c002abe.json} (82%) rename data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/{2cb2551b-dbca-46d9-a19a-165d1ac60dee.json => 5f9758a3-fd6d-4598-930a-9c01420d05e8.json} (82%) rename data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/{3c53ce3d-4ee8-483c-be9f-964395103289.json => 7592c0d8-a06c-4189-81a1-dbf794d22c8b.json} (82%) rename data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/{11e364be-39e9-4b42-97d7-ab771f17973c.json => 83c0e8e3-087c-4d61-9153-e571b4971871.json} (82%) rename data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/{bbcf8f14-600c-4c93-b63d-64aabcab23a3.json => c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json} (82%) rename data/helm_mmlu/microsoft/phi-2/{91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json => 5baac093-babb-41cd-a2f4-985d0b91be37.json} (82%) rename data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/{e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json => 1bf54088-ba12-45b4-8f80-63d5c38f58f6.json} (82%) rename data/helm_mmlu/microsoft/phi-3-small-8k-instruct/{16c66bdf-dda3-4b12-b38c-73abee6a702f.json => 5ed0a970-200f-4f23-9623-e714afa49ddf.json} (82%) rename data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/{d0783259-681a-438f-b7dc-1c625a0be8ba.json => e7fd06a6-65e5-4f88-8e86-c513f78e31db.json} (82%) rename data/helm_mmlu/mistralai/mistral-7b-v0.1/{a05ce725-cdf0-4fe3-88b9-8631229e4443.json => ac047aef-008f-4c87-a6d5-4f331ebf5c53.json} (82%) rename data/helm_mmlu/mistralai/mistral-large-2402/{0dee4200-c4f0-438e-8d0d-ca92515c6e33.json => ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json} (82%) rename data/helm_mmlu/mistralai/mistral-large-2407/{2869d585-567d-4ddc-ac38-3e036061b13e.json => 7517b6c9-c613-416c-aadb-39fd6d252da7.json} (82%) rename data/helm_mmlu/mistralai/mistral-small-2402/{d277cca3-64da-4e4b-9210-3f5b910c975c.json => 85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json} (82%) rename data/helm_mmlu/mistralai/mixtral-8x22b/{cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json => df568c3c-8a5c-4455-836d-c980d7f5ea5c.json} (82%) rename data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/{0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json => 96e24977-ca6d-402c-bfd8-62be4cd9b902.json} (82%) rename data/helm_mmlu/mistralai/open-mistral-nemo-2407/{87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json => e5b2636a-8438-40c0-9f89-9f35585bf740.json} (82%) rename data/helm_mmlu/openai/gpt-3.5-turbo-0125/{48a0dd6b-9304-460a-8e4e-420c60dfa854.json => f3259d92-3c95-4b78-81ae-f7f4b80aec63.json} (82%) rename data/helm_mmlu/openai/gpt-3.5-turbo-0613/{1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json => 5ba23a34-4232-487f-b3e9-326d776135be.json} (82%) rename data/helm_mmlu/openai/gpt-4-0613/{8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json => 5bc1a462-f753-4259-91c3-a549491b2986.json} (82%) rename data/helm_mmlu/openai/gpt-4-1106-preview/{174ad35c-d6b5-49bd-930c-9c83608213a9.json => 16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json} (82%) rename data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/{348bbc24-09de-4d1e-98bc-079e87fea558.json => dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json} (82%) rename data/helm_mmlu/openai/gpt-4o-2024-05-13/{f37fc452-58f2-4d80-a71c-9331f7fe549e.json => 2ca11d4c-52e6-49ea-a5cb-238c0313c483.json} (82%) rename data/helm_mmlu/openai/gpt-4o-2024-08-06/{71df45d2-1a27-4ff2-853c-e853f809ff52.json => de400624-6c2e-47af-b851-54c4075c30ee.json} (82%) rename data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/{7c049135-a8bc-46ca-9a85-cba23e8696fd.json => 34441b3b-4d66-444c-af85-ca0666a48ed4.json} (82%) rename data/helm_mmlu/qwen/qwen1.5-110b-chat/{69737d19-682b-494f-b10b-fb788e83076b.json => eecf5e40-9110-47ea-a72b-9ba587b96e30.json} (82%) rename data/helm_mmlu/qwen/qwen1.5-14b/{c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json => f26fb123-c214-4d18-aea8-b05b4ea1819b.json} (82%) rename data/helm_mmlu/qwen/qwen1.5-32b/{ed668c03-e5df-4871-b2fa-876b2cda62f3.json => 30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json} (82%) rename data/helm_mmlu/qwen/qwen1.5-72b/{c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json => b152cd5c-cbc0-48f4-ba37-16878c3afba1.json} (82%) rename data/helm_mmlu/qwen/qwen1.5-7b/{1c743b00-0ca6-4332-9bb6-7f62190d74e3.json => dac223e9-3073-46f9-924b-c5a6408f5da9.json} (82%) rename data/helm_mmlu/qwen/qwen2-72b-instruct/{7f9317d3-b2bc-481d-9b28-9f305612ac58.json => a7a218ff-7afe-417c-ac39-cf305d592d56.json} (82%) rename data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/{7b3bc40a-a606-419d-b784-99697c1df5bc.json => 2e165735-43b8-4317-9cde-35aa4b5bcb26.json} (82%) rename data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/{d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json => 15c25bc5-7b1e-4771-bda2-fd04d74e1463.json} (82%) rename data/helm_mmlu/snowflake/snowflake-arctic-instruct/{cc68185c-6ee2-40bd-8951-f104d898c7f8.json => 26036c7c-e981-46e8-b5e9-dcd7d116af70.json} (82%) rename data/helm_mmlu/upstage/solar-pro-241126/{78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json => b3269e4e-98a7-4795-8ef3-fc87774a54b7.json} (82%) rename data/helm_mmlu/writer/palmyra-x-004/{ba74f375-fd6d-4bba-af63-605bd73c9b7f.json => 284fde9f-8570-4e6d-9190-e52d8723fe57.json} (82%) rename data/helm_mmlu/writer/palmyra-x-v3/{41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json => fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json} (82%) create mode 100755 scripts/HELM/parse_helm_leaderboards.sh diff --git a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json similarity index 80% rename from data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json rename to data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json index 42f19b810..28c2132cc 100644 --- a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json +++ b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.475, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,14 +100,23 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -141,14 +162,23 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -194,11 +224,20 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -244,12 +283,21 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json similarity index 80% rename from data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json rename to data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json index c596a8093..c2c0ac804 100644 --- a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json +++ b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.44, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,14 +100,23 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -141,14 +162,23 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -194,11 +224,20 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -244,12 +283,21 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json similarity index 80% rename from data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json rename to data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json index da8bb1b91..cbc2ce18e 100644 --- a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json +++ b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.405, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,14 +100,23 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -141,14 +162,23 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -194,11 +224,20 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -244,12 +283,21 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json similarity index 80% rename from data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json rename to data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json index cb4638d3d..4bae095b1 100644 --- a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json +++ b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.332, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,14 +100,23 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -141,14 +162,23 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -194,11 +224,20 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -244,12 +283,21 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json b/data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json similarity index 80% rename from data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json rename to data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json index 0670c6db3..f34e2fca2 100644 --- a/data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json +++ b/data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.551, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json b/data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json similarity index 80% rename from data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json rename to data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json index 2c6f0abd0..da4fca4b9 100644 --- a/data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json +++ b/data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.522, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json b/data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json similarity index 81% rename from data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json rename to data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json index 3a64b94b2..7d306af4a 100644 --- a/data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json +++ b/data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.637, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json b/data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json similarity index 80% rename from data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json rename to data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json index bbdb8512b..9634c0423 100644 --- a/data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json +++ b/data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.591, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json similarity index 80% rename from data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json rename to data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json index 44b7ab97a..59583f434 100644 --- a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json +++ b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.549, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json similarity index 80% rename from data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json rename to data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json index b8e94bdb5..050628b1e 100644 --- a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json +++ b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.653, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json similarity index 80% rename from data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json rename to data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json index a41bf85dc..325dd380e 100644 --- a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json +++ b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.7 Sonnet (20250219)", + "name": "Claude 3.7 Sonnet 20250219", "id": "anthropic/claude-3-7-sonnet-20250219", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.674, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json new file mode 100644 index 000000000..82dc8fad1 --- /dev/null +++ b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Claude 4.5 Haiku 20251001", + "id": "anthropic/claude-haiku-4-5-20251001", + "developer": "anthropic", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.717, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 7.381503096938465 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.777, + "details": { + "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=3.701, mean=3.701, max=3.701, sum=3.701 (1)", + "tab": "Efficiency", + "score": 3.7008020806312563 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", + "tab": "General information", + "score": 252.461 + }, + "MMLU-Pro - # output tokens": { + "description": "min=374.129, mean=374.129, max=374.129, sum=374.129 (1)", + "tab": "General information", + "score": 374.129 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.605, + "details": { + "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=5.102, mean=5.102, max=5.102, sum=5.102 (1)", + "tab": "Efficiency", + "score": 5.102193982611857 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", + "tab": "General information", + "score": 272.73766816143495 + }, + "GPQA - # output tokens": { + "description": "min=524.525, mean=524.525, max=524.525, sum=524.525 (1)", + "tab": "General information", + "score": 524.5246636771301 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.801, + "details": { + "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=4.355, mean=4.355, max=4.355, sum=4.355 (1)", + "tab": "Efficiency", + "score": 4.355410516372229 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", + "tab": "General information", + "score": 47.15896487985213 + }, + "IFEval - # output tokens": { + "description": "min=390.416, mean=390.416, max=390.416, sum=390.416 (1)", + "tab": "General information", + "score": 390.4158964879852 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.839, + "details": { + "description": "min=0.839, mean=0.839, max=0.839, sum=0.839 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=16.317, mean=16.317, max=16.317, sum=16.317 (1)", + "tab": "Efficiency", + "score": 16.317131044387818 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1835.337, mean=1835.337, max=1835.337, sum=1835.337 (1)", + "tab": "General information", + "score": 1835.337 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.561, + "details": { + "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=7.432, mean=7.432, max=7.432, sum=7.432 (1)", + "tab": "Efficiency", + "score": 7.431977860689163 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", + "tab": "General information", + "score": 110.563 + }, + "Omni-MATH - # output tokens": { + "description": "min=937.799, mean=937.799, max=937.799, sum=937.799 (1)", + "tab": "General information", + "score": 937.799 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json similarity index 80% rename from data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json rename to data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json index a9349e9cb..0e6c52fbd 100644 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json +++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Opus (20250514, extended thinking)", + "name": "Claude 4 Opus 20250514, extended thinking", "id": "anthropic/claude-opus-4-20250514-thinking-10k", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.78, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json similarity index 80% rename from data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json rename to data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json index c82ca8963..7abaf15ac 100644 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json +++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Opus (20250514)", + "name": "Claude 4 Opus 20250514", "id": "anthropic/claude-opus-4-20250514", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.757, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json similarity index 80% rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json index 6bf01f358..f65747fef 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Sonnet (20250514, extended thinking)", + "name": "Claude 4 Sonnet 20250514, extended thinking", "id": "anthropic/claude-sonnet-4-20250514-thinking-10k", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.766, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json similarity index 80% rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json index af4facce4..98193fa4e 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Sonnet (20250514)", + "name": "Claude 4 Sonnet 20250514", "id": "anthropic/claude-sonnet-4-20250514", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.733, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json new file mode 100644 index 000000000..3583acbb0 --- /dev/null +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Claude 4.5 Sonnet 20250929", + "id": "anthropic/claude-sonnet-4-5-20250929", + "developer": "anthropic", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.762, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 17.536448448412127 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.869, + "details": { + "description": "min=0.869, mean=0.869, max=0.869, sum=0.869 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=9.03, mean=9.03, max=9.03, sum=9.03 (1)", + "tab": "Efficiency", + "score": 9.029817205530268 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", + "tab": "General information", + "score": 252.461 + }, + "MMLU-Pro - # output tokens": { + "description": "min=392.292, mean=392.292, max=392.292, sum=392.292 (1)", + "tab": "General information", + "score": 392.292 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.686, + "details": { + "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=12.414, mean=12.414, max=12.414, sum=12.414 (1)", + "tab": "Efficiency", + "score": 12.414452127318263 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", + "tab": "General information", + "score": 272.73766816143495 + }, + "GPQA - # output tokens": { + "description": "min=544.215, mean=544.215, max=544.215, sum=544.215 (1)", + "tab": "General information", + "score": 544.2152466367713 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.85, + "details": { + "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=10.904, mean=10.904, max=10.904, sum=10.904 (1)", + "tab": "Efficiency", + "score": 10.90394415211986 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", + "tab": "General information", + "score": 47.15896487985213 + }, + "IFEval - # output tokens": { + "description": "min=414.632, mean=414.632, max=414.632, sum=414.632 (1)", + "tab": "General information", + "score": 414.63216266173754 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.854, + "details": { + "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=38.544, mean=38.544, max=38.544, sum=38.544 (1)", + "tab": "Efficiency", + "score": 38.54364204096484 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1804.604, mean=1804.604, max=1804.604, sum=1804.604 (1)", + "tab": "General information", + "score": 1804.604 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.553, + "details": { + "description": "min=0.553, mean=0.553, max=0.553, sum=0.553 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=16.79, mean=16.79, max=16.79, sum=16.79 (1)", + "tab": "Efficiency", + "score": 16.790386716127397 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", + "tab": "General information", + "score": 110.563 + }, + "Omni-MATH - # output tokens": { + "description": "min=892.774, mean=892.774, max=892.774, sum=892.774 (1)", + "tab": "General information", + "score": 892.774 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json similarity index 81% rename from data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json rename to data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json index 0b36b4b41..6cc5a7f14 100644 --- a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json +++ b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.699, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json b/data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json similarity index 81% rename from data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json rename to data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json index 3502a2f83..46c4843d4 100644 --- a/data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json +++ b/data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.665, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json b/data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json similarity index 80% rename from data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json rename to data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json index 9cecc3e6e..26e2e73d6 100644 --- a/data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json +++ b/data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json b/data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json similarity index 80% rename from data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json rename to data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json index c41c3cf10..1157dc164 100644 --- a/data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json +++ b/data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.657, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json b/data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json similarity index 81% rename from data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json rename to data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json index 963d02bef..68450c9bd 100644 --- a/data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json +++ b/data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json similarity index 80% rename from data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json rename to data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json index 87e886284..1bc6a5842 100644 --- a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json +++ b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash Lite (02-05 preview)", + "name": "Gemini 2.0 Flash Lite 02-05 preview", "id": "google/gemini-2.0-flash-lite-preview-02-05", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.642, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json b/data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json similarity index 81% rename from data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json rename to data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json index a5294b486..f9f820a96 100644 --- a/data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json +++ b/data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.591, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json similarity index 80% rename from data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json rename to data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json index d0e1ed757..7f7987a29 100644 --- a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json +++ b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.5 Flash (04-17 preview)", + "name": "Gemini 2.5 Flash 04-17 preview", "id": "google/gemini-2.5-flash-preview-04-17", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.626, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json similarity index 80% rename from data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json rename to data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json index f1093c814..c845227fa 100644 --- a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json +++ b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.5 Pro (03-25 preview)", + "name": "Gemini 2.5 Pro 03-25 preview", "id": "google/gemini-2.5-pro-preview-03-25", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.745, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json b/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json new file mode 100644 index 000000000..e4e82cd5b --- /dev/null +++ b/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Gemini 3 Pro Preview", + "id": "google/gemini-3-pro-preview", + "developer": "google", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.799, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 50.969324812798575 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.903, + "details": { + "description": "min=0.903, mean=0.903, max=0.903, sum=0.903 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=34.903, mean=34.903, max=34.903, sum=34.903 (1)", + "tab": "Efficiency", + "score": 34.903078527212145 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", + "tab": "General information", + "score": 263.673 + }, + "MMLU-Pro - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.803, + "details": { + "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=69.164, mean=69.164, max=69.164, sum=69.164 (1)", + "tab": "Efficiency", + "score": 69.16407415364355 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", + "tab": "General information", + "score": 273.7354260089686 + }, + "GPQA - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.876, + "details": { + "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=18.201, mean=18.201, max=18.201, sum=18.201 (1)", + "tab": "Efficiency", + "score": 18.200553727458452 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", + "tab": "General information", + "score": 47.33086876155268 + }, + "IFEval - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.859, + "details": { + "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=37.094, mean=37.094, max=37.094, sum=37.094 (1)", + "tab": "Efficiency", + "score": 37.09404513451669 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.555, + "details": { + "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=95.485, mean=95.485, max=95.485, sum=95.485 (1)", + "tab": "Efficiency", + "score": 95.48487252116203 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", + "tab": "General information", + "score": 111.956 + }, + "Omni-MATH - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json similarity index 81% rename from data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json rename to data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json index 42be38419..828363b5a 100644 --- a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json +++ b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.463, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json b/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json new file mode 100644 index 000000000..8203eb4c6 --- /dev/null +++ b/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "IBM Granite 4.0 Small", + "id": "ibm/granite-4.0-h-small", + "developer": "ibm", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.575, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 21.31162992088884 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.569, + "details": { + "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=12.071, mean=12.071, max=12.071, sum=12.071 (1)", + "tab": "Efficiency", + "score": 12.070928404092788 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", + "tab": "General information", + "score": 288.391 + }, + "MMLU-Pro - # output tokens": { + "description": "min=372.93, mean=372.93, max=372.93, sum=372.93 (1)", + "tab": "General information", + "score": 372.93 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.383, + "details": { + "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=17.606, mean=17.606, max=17.606, sum=17.606 (1)", + "tab": "Efficiency", + "score": 17.606201725690354 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", + "tab": "General information", + "score": 303.2645739910314 + }, + "GPQA - # output tokens": { + "description": "min=439.648, mean=439.648, max=439.648, sum=439.648 (1)", + "tab": "General information", + "score": 439.6479820627803 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.89, + "details": { + "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.366, mean=13.366, max=13.366, sum=13.366 (1)", + "tab": "Efficiency", + "score": 13.366226098453712 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", + "tab": "General information", + "score": 51.53419593345656 + }, + "IFEval - # output tokens": { + "description": "min=494.717, mean=494.717, max=494.717, sum=494.717 (1)", + "tab": "General information", + "score": 494.7171903881701 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.739, + "details": { + "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=30.807, mean=30.807, max=30.807, sum=30.807 (1)", + "tab": "Efficiency", + "score": 30.80672695994377 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=996.159, mean=996.159, max=996.159, sum=996.159 (1)", + "tab": "General information", + "score": 996.159 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.296, + "details": { + "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=32.708, mean=32.708, max=32.708, sum=32.708 (1)", + "tab": "Efficiency", + "score": 32.70806641626358 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", + "tab": "General information", + "score": 118.438 + }, + "Omni-MATH - # output tokens": { + "description": "min=1020.51, mean=1020.51, max=1020.51, sum=1020.51 (1)", + "tab": "General information", + "score": 1020.51 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json b/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json new file mode 100644 index 000000000..bfe399026 --- /dev/null +++ b/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "IBM Granite 4.0 Micro", + "id": "ibm/granite-4.0-micro", + "developer": "ibm", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.486, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 5.725128505637726 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.395, + "details": { + "description": "min=0.395, mean=0.395, max=0.395, sum=0.395 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=3.135, mean=3.135, max=3.135, sum=3.135 (1)", + "tab": "Efficiency", + "score": 3.1348352246284485 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", + "tab": "General information", + "score": 288.391 + }, + "MMLU-Pro - # output tokens": { + "description": "min=325.255, mean=325.255, max=325.255, sum=325.255 (1)", + "tab": "General information", + "score": 325.255 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.307, + "details": { + "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=3.075, mean=3.075, max=3.075, sum=3.075 (1)", + "tab": "Efficiency", + "score": 3.075281912970436 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", + "tab": "General information", + "score": 303.2645739910314 + }, + "GPQA - # output tokens": { + "description": "min=337.417, mean=337.417, max=337.417, sum=337.417 (1)", + "tab": "General information", + "score": 337.4170403587444 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.849, + "details": { + "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=4.58, mean=4.58, max=4.58, sum=4.58 (1)", + "tab": "Efficiency", + "score": 4.580414981806785 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", + "tab": "General information", + "score": 51.53419593345656 + }, + "IFEval - # output tokens": { + "description": "min=497.8, mean=497.8, max=497.8, sum=497.8 (1)", + "tab": "General information", + "score": 497.8003696857671 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.67, + "details": { + "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=8.161, mean=8.161, max=8.161, sum=8.161 (1)", + "tab": "Efficiency", + "score": 8.160923891305924 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1037.706, mean=1037.706, max=1037.706, sum=1037.706 (1)", + "tab": "General information", + "score": 1037.706 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.209, + "details": { + "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=9.674, mean=9.674, max=9.674, sum=9.674 (1)", + "tab": "Efficiency", + "score": 9.674186517477036 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", + "tab": "General information", + "score": 118.438 + }, + "Omni-MATH - # output tokens": { + "description": "min=1145.889, mean=1145.889, max=1145.889, sum=1145.889 (1)", + "tab": "General information", + "score": 1145.889 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json b/data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json similarity index 80% rename from data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json rename to data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json index 3622da7c6..215be80f3 100644 --- a/data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json +++ b/data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,14 +100,23 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -141,14 +162,23 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -194,11 +224,20 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -244,12 +283,21 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json similarity index 80% rename from data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json rename to data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json index 6e7a59864..41fd4d1af 100644 --- a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json +++ b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.618, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json similarity index 80% rename from data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json rename to data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json index 9ba719da5..7e6e617b7 100644 --- a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json +++ b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.574, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json similarity index 80% rename from data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json rename to data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json index 4657892fd..0c2bb79e7 100644 --- a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json +++ b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.444, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json similarity index 80% rename from data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json rename to data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json index 9c2141acc..71c8e88c3 100644 --- a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json +++ b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 4 Maverick (17Bx128E) Instruct FP8", + "name": "Llama 4 Maverick 17Bx128E Instruct FP8", "id": "meta/llama-4-maverick-17b-128e-instruct-fp8", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.718, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json similarity index 80% rename from data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json rename to data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json index 2d19156dc..35aef174b 100644 --- a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json +++ b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 4 Scout (17Bx16E) Instruct", + "name": "Llama 4 Scout 17Bx16E Instruct", "id": "meta/llama-4-scout-17b-16e-instruct", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.644, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json similarity index 80% rename from data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json rename to data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json index 6663598e4..ee064ad73 100644 --- a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json +++ b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.376, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json b/data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json similarity index 81% rename from data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json rename to data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json index db1fa9b82..f4fd3ec06 100644 --- a/data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json +++ b/data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2411)", + "name": "Mistral Large 2411", "id": "mistralai/mistral-large-2411", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.598, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json b/data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json similarity index 80% rename from data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json rename to data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json index 69ce74931..ff90f0105 100644 --- a/data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json +++ b/data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small 3.1 (2503)", + "name": "Mistral Small 3.1 2503", "id": "mistralai/mistral-small-2503", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.558, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json similarity index 80% rename from data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json rename to data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json index 2dfb94872..703963331 100644 --- a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json +++ b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral Instruct (8x22B)", + "name": "Mixtral Instruct 8x22B", "id": "mistralai/mixtral-8x22b-instruct-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.478, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json similarity index 80% rename from data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json rename to data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json index 293d11168..c522fd879 100644 --- a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json +++ b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral Instruct (8x7B)", + "name": "Mixtral Instruct 8x7B", "id": "mistralai/mixtral-8x7b-instruct-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.397, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json b/data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json similarity index 81% rename from data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json rename to data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json index 4c25e86d3..b69be21a9 100644 --- a/data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json +++ b/data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.768, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json similarity index 81% rename from data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json rename to data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json index c005600e1..17443bc6f 100644 --- a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json +++ b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 (2025-04-14)", + "name": "GPT-4.1 2025-04-14", "id": "openai/gpt-4.1-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json similarity index 80% rename from data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json rename to data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json index d6481e60a..0342d7835 100644 --- a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json +++ b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 mini (2025-04-14)", + "name": "GPT-4.1 mini 2025-04-14", "id": "openai/gpt-4.1-mini-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.726, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json similarity index 80% rename from data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json rename to data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json index e878bf385..15a7d0356 100644 --- a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json +++ b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 nano (2025-04-14)", + "name": "GPT-4.1 nano 2025-04-14", "id": "openai/gpt-4.1-nano-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.616, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json b/data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json similarity index 81% rename from data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json rename to data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json index ae08e8732..ed5380bd3 100644 --- a/data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json +++ b/data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-11-20)", + "name": "GPT-4o 2024-11-20", "id": "openai/gpt-4o-2024-11-20", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.634, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json similarity index 80% rename from data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json rename to data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json index c3aeb8ab5..e38c0ac88 100644 --- a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json +++ b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.565, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json b/data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json similarity index 81% rename from data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json rename to data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json index 2fd77c3d1..fb85b633b 100644 --- a/data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json +++ b/data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 (2025-08-07)", + "name": "GPT-5 2025-08-07", "id": "openai/gpt-5-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.807, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json similarity index 81% rename from data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json rename to data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json index cf4a0414b..3ca436502 100644 --- a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json +++ b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 mini (2025-08-07)", + "name": "GPT-5 mini 2025-08-07", "id": "openai/gpt-5-mini-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.819, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json similarity index 80% rename from data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json rename to data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json index a9996e0cd..e271e8724 100644 --- a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json +++ b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 nano (2025-08-07)", + "name": "GPT-5 nano 2025-08-07", "id": "openai/gpt-5-nano-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.748, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json new file mode 100644 index 000000000..492db1047 --- /dev/null +++ b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "GPT-5.1 2025-11-13", + "id": "openai/gpt-5.1-2025-11-13", + "developer": "openai", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.656, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 10.620566227529599 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.579, + "details": { + "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)", + "tab": "Efficiency", + "score": 1.1470122172832489 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", + "tab": "General information", + "score": 248.569 + }, + "MMLU-Pro - # output tokens": { + "description": "min=5.002, mean=5.002, max=5.002, sum=5.002 (1)", + "tab": "General information", + "score": 5.002 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.442, + "details": { + "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=1.002, mean=1.002, max=1.002, sum=1.002 (1)", + "tab": "Efficiency", + "score": 1.002433323539426 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", + "tab": "General information", + "score": 268.15246636771303 + }, + "GPQA - # output tokens": { + "description": "min=5.422, mean=5.422, max=5.422, sum=5.422 (1)", + "tab": "General information", + "score": 5.42152466367713 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.935, + "details": { + "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.159, mean=13.159, max=13.159, sum=13.159 (1)", + "tab": "Efficiency", + "score": 13.15882584436103 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", + "tab": "General information", + "score": 45.67097966728281 + }, + "IFEval - # output tokens": { + "description": "min=647.063, mean=647.063, max=647.063, sum=647.063 (1)", + "tab": "General information", + "score": 647.0628465804067 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.863, + "details": { + "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=28.081, mean=28.081, max=28.081, sum=28.081 (1)", + "tab": "Efficiency", + "score": 28.08133857488632 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=2059.716, mean=2059.716, max=2059.716, sum=2059.716 (1)", + "tab": "General information", + "score": 2059.716 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.464, + "details": { + "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=9.713, mean=9.713, max=9.713, sum=9.713 (1)", + "tab": "Efficiency", + "score": 9.713221177577973 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", + "tab": "General information", + "score": 109.623 + }, + "Omni-MATH - # output tokens": { + "description": "min=1256.266, mean=1256.266, max=1256.266, sum=1256.266 (1)", + "tab": "General information", + "score": 1256.266 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json b/data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json similarity index 80% rename from data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json rename to data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json index 0b6f0418d..13795ec21 100644 --- a/data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json +++ b/data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.77, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json b/data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json similarity index 80% rename from data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json rename to data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json index 36043d89a..d2f755b28 100644 --- a/data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json +++ b/data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.674, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json b/data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json similarity index 80% rename from data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json rename to data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json index 2d017bb31..7455567bf 100644 --- a/data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json +++ b/data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "o3 (2025-04-16)", + "name": "o3 2025-04-16", "id": "openai/o3-2025-04-16", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.811, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json b/data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json similarity index 80% rename from data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json rename to data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json index db654a7b8..c33228ef1 100644 --- a/data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json +++ b/data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "o4-mini (2025-04-16)", + "name": "o4-mini 2025-04-16", "id": "openai/o4-mini-2025-04-16", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.812, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json similarity index 80% rename from data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json rename to data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json index 7bc9ee7ae..31467bc1e 100644 --- a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json +++ b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.599, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json similarity index 80% rename from data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json rename to data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json index 921d79480..0ac7225b8 100644 --- a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json +++ b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.529, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json similarity index 81% rename from data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json rename to data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json index 7bc1c5881..1d36e4190 100644 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json +++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.726, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json similarity index 81% rename from data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json rename to data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json index 355119fa7..04fc2f6cc 100644 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json +++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.798, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json new file mode 100644 index 000000000..bbcecd669 --- /dev/null +++ b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Qwen3-Next 80B A3B Thinking", + "id": "qwen/qwen3-next-80b-a3b-thinking", + "developer": "qwen", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 27.61164260375731 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.786, + "details": { + "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=20.097, mean=20.097, max=20.097, sum=20.097 (1)", + "tab": "Efficiency", + "score": 20.09722422862053 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", + "tab": "General information", + "score": 259.715 + }, + "MMLU-Pro - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.63, + "details": { + "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=40.06, mean=40.06, max=40.06, sum=40.06 (1)", + "tab": "Efficiency", + "score": 40.06039341950096 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", + "tab": "General information", + "score": 274.36995515695065 + }, + "GPQA - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.81, + "details": { + "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.893, mean=13.893, max=13.893, sum=13.893 (1)", + "tab": "Efficiency", + "score": 13.89268838323639 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", + "tab": "General information", + "score": 46.491682070240294 + }, + "IFEval - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.807, + "details": { + "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=23.095, mean=23.095, max=23.095, sum=23.095 (1)", + "tab": "Efficiency", + "score": 23.095464605808257 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.467, + "details": { + "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=40.912, mean=40.912, max=40.912, sum=40.912 (1)", + "tab": "Efficiency", + "score": 40.91244238162041 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", + "tab": "General information", + "score": 111.6 + }, + "Omni-MATH - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json b/data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json similarity index 80% rename from data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json rename to data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json index cf2b63d2e..da11997be 100644 --- a/data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json +++ b/data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-fin/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.577, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json b/data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json similarity index 80% rename from data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json rename to data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json index 0d8108574..78088b82e 100644 --- a/data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json +++ b/data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-med/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-med/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.476, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json b/data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json similarity index 80% rename from data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json rename to data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json index 8b3240898..b630b3cd0 100644 --- a/data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json +++ b/data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json b/data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json similarity index 80% rename from data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json rename to data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json index ba834a256..c212295f1 100644 --- a/data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json +++ b/data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x5/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.696, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json b/data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json similarity index 80% rename from data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json rename to data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json index 7640dfe10..34f4be43b 100644 --- a/data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json +++ b/data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-3-beta/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json b/data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json similarity index 80% rename from data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json rename to data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json index 8570e4d80..31dab978e 100644 --- a/data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json +++ b/data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json b/data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json similarity index 80% rename from data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json rename to data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json index b9fbeb3c0..872c4f1f5 100644 --- a/data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json +++ b/data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-4-0709/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Grok 4 (0709)", + "name": "Grok 4 0709", "id": "xai/grok-4-0709", "developer": "xai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.785, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json similarity index 81% rename from data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json rename to data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json index b4d4807f0..4ead0f554 100644 --- a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json +++ b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770830201.581632", + "retrieved_timestamp": "1770830201.581632", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.67, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,10 +42,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU-Pro - COT correct", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -88,13 +100,22 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "GPQA - COT correct", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", "lower_is_better": false, @@ -140,13 +161,22 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { "evaluation_name": "IFEval - IFEval Strict Acc", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", "lower_is_better": false, @@ -191,10 +221,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WildBench - WB Score", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", "lower_is_better": false, @@ -240,11 +279,20 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { "evaluation_name": "Omni-MATH - Acc", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", "lower_is_better": false, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json similarity index 92% rename from data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json rename to data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json index fac51642a..58e2410df 100644 --- a/data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json +++ b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/anthropic_Anthropic-LM-v4-s3-52B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Anthropic-LM v4-s3 52B", - "id": "anthropic/Anthropic-LM-v4-s3-52B", - "developer": "anthropic", + "id": "Anthropic-LM-v4-s3-52B", + "developer": "unknown", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.78, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json b/data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json similarity index 92% rename from data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json rename to data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json index 05d951313..4805e7ac8 100644 --- a/data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json +++ b/data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.433, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json similarity index 92% rename from data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json rename to data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json index cc58c06c0..e47585440 100644 --- a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json +++ b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.706, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json b/data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json similarity index 92% rename from data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json rename to data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json index 0be03d012..bfd78fa42 100644 --- a/data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json +++ b/data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.517, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json b/data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json similarity index 92% rename from data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json rename to data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json index 3239df52d..a1c2d2860 100644 --- a/data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json +++ b/data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.285, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json b/data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json similarity index 92% rename from data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json rename to data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json index 71ff2dc38..e9db23ac9 100644 --- a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json +++ b/data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.743, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json similarity index 92% rename from data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json rename to data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json index ab1f54c90..38cd07e2a 100644 --- a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json +++ b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json similarity index 91% rename from data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json rename to data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json index 14e3a243d..589346e15 100644 --- a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json +++ b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.553, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json b/data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json similarity index 92% rename from data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json rename to data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json index 9fccefc67..371a206a5 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json +++ b/data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.315, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json similarity index 92% rename from data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json rename to data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json index 9f9536338..715673aae 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json +++ b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.485, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json similarity index 92% rename from data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json rename to data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json index ed0fa9dcd..5f8731441 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json +++ b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.662, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json b/data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json similarity index 92% rename from data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json rename to data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json index 19831593f..04305416d 100644 --- a/data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json +++ b/data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.446, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json b/data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json similarity index 93% rename from data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json rename to data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json index af37640ca..1bbeba7ff 100644 --- a/data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json +++ b/data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/bigscience_T0pp-11B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.197, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json similarity index 92% rename from data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json rename to data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json index 5eb323191..fadfb62da 100644 --- a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json +++ b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.874, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json similarity index 92% rename from data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json rename to data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json index d20d6332d..b1c061a45 100644 --- a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json +++ b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.675, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json similarity index 92% rename from data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json rename to data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json index 54182b504..bd838c107 100644 --- a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json +++ b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.372, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json similarity index 92% rename from data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json rename to data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json index ecba92b3a..3e7a0f6fa 100644 --- a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json +++ b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.23, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json similarity index 92% rename from data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json rename to data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json index 0b33b0763..745f99da6 100644 --- a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json +++ b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.312, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json b/data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json similarity index 92% rename from data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json rename to data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json index 4abc0c79b..478f77b1d 100644 --- a/data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json +++ b/data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.109, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json similarity index 92% rename from data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json rename to data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json index 6c362be4c..2039d0727 100644 --- a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json +++ b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.56, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json similarity index 92% rename from data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json rename to data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json index f92b78094..216532187 100644 --- a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json +++ b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.664, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json b/data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json similarity index 91% rename from data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json rename to data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json index f6f9d6eae..8f5d16956 100644 --- a/data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json +++ b/data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_Pythia-12B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Pythia 12B", - "id": "eleuther-ai/Pythia-12B", - "developer": "eleuther-ai", + "id": "eleutherai/Pythia-12B", + "developer": "eleutherai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.257, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json b/data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json similarity index 91% rename from data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json rename to data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json index 2b488fa6f..20ca16498 100644 --- a/data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json +++ b/data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_Pythia-6.9B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Pythia 6.9B", - "id": "eleuther-ai/Pythia-6.9B", - "developer": "eleuther-ai", + "id": "eleutherai/Pythia-6.9B", + "developer": "eleutherai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json b/data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json similarity index 91% rename from data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json rename to data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json index 725954e16..d36f642d7 100644 --- a/data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json +++ b/data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/writer_Palmyra-X-43B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Palmyra X 43B", - "id": "writer/Palmyra-X-43B", - "developer": "writer", + "id": "google/Palmyra-X-43B", + "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.732, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json b/data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json similarity index 92% rename from data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json rename to data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json index 9bacd9bf9..0f7601506 100644 --- a/data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json +++ b/data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/google_T5-11B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_T5-11B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.131, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json b/data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json similarity index 92% rename from data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json rename to data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json index c9bf42a12..70193f3b0 100644 --- a/data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json +++ b/data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/google_UL2-20B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_UL2-20B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.167, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json b/data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json similarity index 91% rename from data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json rename to data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json index 65a179431..385ac9b25 100644 --- a/data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json +++ b/data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.706, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json b/data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json similarity index 91% rename from data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json rename to data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json index bf5b7f8ab..3de9b1fd2 100644 --- a/data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json +++ b/data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.625, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json b/data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json similarity index 91% rename from data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json rename to data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json index b007605c7..ac2da41ef 100644 --- a/data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json +++ b/data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-13B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.595, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json b/data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json similarity index 91% rename from data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json rename to data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json index 8e6647f52..1b33fd761 100644 --- a/data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json +++ b/data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-30B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.781, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json b/data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json similarity index 91% rename from data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json rename to data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json index 1dbaa6d85..a7d6351b2 100644 --- a/data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json +++ b/data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-65B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-65B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.908, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json b/data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json similarity index 91% rename from data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json rename to data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json index 4a772fb18..79b00a818 100644 --- a/data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json +++ b/data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.533, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json b/data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json similarity index 91% rename from data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json rename to data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json index de40c742e..170095f5b 100644 --- a/data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json +++ b/data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-13B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.823, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json b/data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json similarity index 91% rename from data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json rename to data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json index 77f6938f9..01e4b1b14 100644 --- a/data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json +++ b/data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-70B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-70B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.944, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json b/data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json similarity index 91% rename from data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json rename to data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json index 3b3b39208..40b71e7de 100644 --- a/data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json +++ b/data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.607, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json b/data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json similarity index 92% rename from data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json rename to data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json index 0da99434e..f864b9222 100644 --- a/data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json +++ b/data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_OPT-175B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_OPT-175B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json b/data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json similarity index 92% rename from data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json rename to data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json index 929a020d2..ba62ce7d1 100644 --- a/data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json +++ b/data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_OPT-66B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_OPT-66B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.448, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json b/data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json similarity index 92% rename from data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json rename to data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json index 786e640a5..ce5dcad88 100644 --- a/data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json +++ b/data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.787, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json b/data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json similarity index 92% rename from data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json rename to data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json index ade6f8a0a..cfa4e8177 100644 --- a/data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json +++ b/data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.309, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json b/data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json similarity index 91% rename from data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json rename to data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json index a4f716c06..738857e58 100644 --- a/data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json +++ b/data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mistral-ai_Mistral-v0.1-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Mistral v0.1 7B", - "id": "mistral-ai/Mistral-v0.1-7B", - "developer": "mistral-ai", + "id": "mistralai/Mistral-v0.1-7B", + "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.884, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json b/data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json similarity index 91% rename from data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json rename to data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json index bf414b629..2580877d4 100644 --- a/data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json +++ b/data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mosaicml_MPT-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.714, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json b/data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json similarity index 91% rename from data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json rename to data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json index dd4c71e77..a7cbf9856 100644 --- a/data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json +++ b/data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.716, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json b/data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json similarity index 92% rename from data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json rename to data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json index 64c16a070..c135cdcfb 100644 --- a/data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json +++ b/data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_GPT-J-6B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_GPT-J-6B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "GPT-J 6B", - "id": "eleuther-ai/GPT-J-6B", - "developer": "eleuther-ai", + "id": "openai/GPT-J-6B", + "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.273, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json b/data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json similarity index 92% rename from data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json rename to data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json index b26d9ed28..d4e4c3e18 100644 --- a/data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json +++ b/data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_GPT-NeoX-20B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "GPT-NeoX 20B", - "id": "eleuther-ai/GPT-NeoX-20B", - "developer": "eleuther-ai", + "id": "openai/GPT-NeoX-20B", + "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.351, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json b/data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json similarity index 94% rename from data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json rename to data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json index ee84122f5..ae351a8ab 100644 --- a/data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json +++ b/data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_ada-350M/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_ada-350M/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.108, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json b/data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json similarity index 94% rename from data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json rename to data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json index 3a55a8db1..4f76e1f1b 100644 --- a/data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json +++ b/data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_babbage-1.3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_babbage-1.3B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.114, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json b/data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json similarity index 94% rename from data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json rename to data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json index d7959e7bb..84c344282 100644 --- a/data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json +++ b/data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_curie-6.7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_curie-6.7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.247, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json b/data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json similarity index 94% rename from data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json rename to data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json index 6b30fefef..fb0de7bd8 100644 --- a/data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json +++ b/data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_davinci-175B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_davinci-175B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.538, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json b/data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json similarity index 91% rename from data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json rename to data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json index 435cb040d..582bc2e6a 100644 --- a/data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json +++ b/data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.76, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json b/data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json similarity index 91% rename from data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json rename to data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json index bf7553bf6..5a9810e18 100644 --- a/data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json +++ b/data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.783, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json b/data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json similarity index 94% rename from data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json rename to data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json index d1a92ef67..0e0d9602b 100644 --- a/data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json +++ b/data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-ada-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-ada-001/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.107, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json b/data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json similarity index 94% rename from data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json rename to data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json index fb51f6a42..734c00775 100644 --- a/data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json +++ b/data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-babbage-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-babbage-001/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.229, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json b/data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json similarity index 94% rename from data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json rename to data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json index bb4d6e7ff..ba874427c 100644 --- a/data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json +++ b/data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-curie-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-curie-001/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.36, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json b/data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json similarity index 94% rename from data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json rename to data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json index 4d9b820e6..4555e0f80 100644 --- a/data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json +++ b/data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-davinci-002/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-davinci-002/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.905, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json b/data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json similarity index 94% rename from data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json rename to data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json index 437247369..7fd229e00 100644 --- a/data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json +++ b/data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-davinci-003/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-davinci-003/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.872, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json b/data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json similarity index 91% rename from data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json rename to data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json index 24ce27c0b..f68731052 100644 --- a/data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json +++ b/data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/stanford_Alpaca-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.381, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json b/data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json similarity index 91% rename from data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json rename to data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json index a08e1b6ca..85693f897 100644 --- a/data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json +++ b/data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-40B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon 40B", - "id": "tii-uae/Falcon-40B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-40B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.729, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json b/data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json similarity index 91% rename from data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json rename to data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json index 0911bfafa..e165123de 100644 --- a/data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json +++ b/data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon 7B", - "id": "tii-uae/Falcon-7B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-7B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.378, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json b/data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json similarity index 91% rename from data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json rename to data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json index 99345e7ef..3c1369c88 100644 --- a/data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json +++ b/data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-Instruct-40B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon-Instruct 40B", - "id": "tii-uae/Falcon-Instruct-40B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-Instruct-40B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json similarity index 91% rename from data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json rename to data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json index b0b75c2b1..19076bf3f 100644 --- a/data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json +++ b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-Instruct-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon-Instruct 7B", - "id": "tii-uae/Falcon-Instruct-7B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-Instruct-7B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.244, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json b/data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json similarity index 91% rename from data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json rename to data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json index 66ae49567..90ced7618 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json +++ b/data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.378, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json similarity index 91% rename from data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json rename to data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json index f09058f3c..858c06ee0 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json +++ b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.311, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json similarity index 91% rename from data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json rename to data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json index 9ed3b7bf9..e246416bd 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json +++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json similarity index 91% rename from data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json rename to data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json index bb56f1198..828ab9683 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json +++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.366, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json b/data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json similarity index 91% rename from data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json rename to data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json index add4859be..0d4ab9c94 100644 --- a/data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json +++ b/data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.568, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json b/data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json similarity index 91% rename from data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json rename to data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json index 74662144a..4b439ab57 100644 --- a/data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json +++ b/data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/yandex_YaLM-100B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/yandex_YaLM-100B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.075, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json b/data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json similarity index 91% rename from data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json rename to data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json index 2f17c575d..67e0f75ce 100644 --- a/data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json +++ b/data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770830385.7573261", + "retrieved_timestamp": "1770830385.7573261", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.512, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,10 +77,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -142,10 +154,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "BoolQ - EM", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -235,10 +256,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -328,10 +358,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (open-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -496,10 +535,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "QuAC - F1", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -589,10 +637,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "HellaSwag - EM", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -657,10 +714,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -725,10 +791,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "TruthfulQA - EM", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -793,10 +868,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", "lower_is_better": false, @@ -951,10 +1035,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CNN/DailyMail - ROUGE-2", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1074,10 +1167,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "XSUM - ROUGE-2", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", "lower_is_better": false, @@ -1197,10 +1299,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "IMDB - EM", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1290,10 +1401,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "CivilComments - EM", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1383,10 +1503,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "RAFT - EM", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json b/data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json similarity index 61% rename from data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json rename to data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json index 841d52f14..abd3e5e5b 100644 --- a/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json +++ b/data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770830411.78817", + "retrieved_timestamp": "1770830411.78817", "source_metadata": { "source_name": "helm_instruct", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,14 +34,22 @@ "score_details": { "score": 0.611, "details": { - "description": null, "tab": "Instruction Following" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "Anthropic RLHF dataset - Harmlessness", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -53,30 +65,39 @@ } }, "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Best ChatGPT Prompts - Harmlessness", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -92,18 +113,27 @@ } }, "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Koala test dataset - Harmlessness", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -119,16 +149,25 @@ } }, "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Open Assistant - Harmlessness", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -144,17 +183,26 @@ } }, "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Self Instruct - Harmlessness", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -170,16 +218,25 @@ } }, "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Vicuna - Harmlessness", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -195,13 +252,15 @@ } }, "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } } ] diff --git a/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json b/data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json similarity index 61% rename from data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json rename to data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json index 0905e2f21..3aea06a21 100644 --- a/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json +++ b/data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770830411.78817", + "retrieved_timestamp": "1770830411.78817", "source_metadata": { "source_name": "helm_instruct", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Cohere Command beta (52.4B)", + "name": "Cohere Command beta 52.4B", "id": "cohere/command-xlarge-beta", "developer": "cohere", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,14 +34,22 @@ "score_details": { "score": 0.089, "details": { - "description": null, "tab": "Instruction Following" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "Anthropic RLHF dataset - Harmlessness", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -53,30 +65,39 @@ } }, "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Best ChatGPT Prompts - Harmlessness", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -92,18 +113,27 @@ } }, "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Koala test dataset - Harmlessness", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -119,16 +149,25 @@ } }, "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Open Assistant - Harmlessness", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -144,17 +183,26 @@ } }, "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Self Instruct - Harmlessness", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -170,16 +218,25 @@ } }, "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Vicuna - Harmlessness", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -195,13 +252,15 @@ } }, "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } } ] diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json similarity index 61% rename from data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json rename to data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json index 4dc9e1ef5..31fd0891a 100644 --- a/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json +++ b/data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770830411.78817", + "retrieved_timestamp": "1770830411.78817", "source_metadata": { "source_name": "helm_instruct", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0613)", + "name": "GPT-3.5 Turbo 0613", "id": "openai/gpt-3.5-turbo-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,14 +34,22 @@ "score_details": { "score": 0.689, "details": { - "description": null, "tab": "Instruction Following" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "Anthropic RLHF dataset - Harmlessness", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -53,30 +65,39 @@ } }, "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Best ChatGPT Prompts - Harmlessness", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -92,18 +113,27 @@ } }, "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Koala test dataset - Harmlessness", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -119,16 +149,25 @@ } }, "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Open Assistant - Harmlessness", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -144,17 +183,26 @@ } }, "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Self Instruct - Harmlessness", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -170,16 +218,25 @@ } }, "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Vicuna - Harmlessness", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -195,13 +252,15 @@ } }, "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } } ] diff --git a/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json b/data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json similarity index 61% rename from data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json rename to data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json index f76268b07..ac8e25cb0 100644 --- a/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json +++ b/data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/openai_gpt-4-0314/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770830411.78817", + "retrieved_timestamp": "1770830411.78817", "source_metadata": { "source_name": "helm_instruct", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 (0314)", + "name": "GPT-4 0314", "id": "openai/gpt-4-0314", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,14 +34,22 @@ "score_details": { "score": 0.611, "details": { - "description": null, "tab": "Instruction Following" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "Anthropic RLHF dataset - Harmlessness", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -53,30 +65,39 @@ } }, "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Best ChatGPT Prompts - Harmlessness", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -92,18 +113,27 @@ } }, "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Koala test dataset - Harmlessness", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -119,16 +149,25 @@ } }, "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Open Assistant - Harmlessness", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -144,17 +183,26 @@ } }, "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Self Instruct - Harmlessness", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -170,16 +218,25 @@ } }, "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } }, { "evaluation_name": "Vicuna - Harmlessness", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, "metric_config": { "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", "lower_is_better": false, @@ -195,13 +252,15 @@ } }, "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } } } ] diff --git a/data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json b/data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json similarity index 82% rename from data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json rename to data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json index 8d9b0c6e7..96c3d4d2d 100644 --- a/data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json +++ b/data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-34b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-34b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (34B)", + "name": "Yi 34B", "id": "01-ai/yi-34b", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.57, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json b/data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json similarity index 82% rename from data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json rename to data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json index 04e690e09..497d98a06 100644 --- a/data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json +++ b/data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-6b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-6b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (6B)", + "name": "Yi 6B", "id": "01-ai/yi-6b", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.253, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json b/data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json similarity index 82% rename from data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json rename to data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json index 6d66d647a..7bea38ffb 100644 --- a/data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json +++ b/data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-large-preview/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi Large (Preview)", + "name": "Yi Large Preview", "id": "01-ai/yi-large-preview", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.471, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json b/data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json similarity index 82% rename from data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json rename to data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json index 4d89d0b52..00a6f037c 100644 --- a/data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json +++ b/data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Base (13B)", + "name": "Luminous Base 13B", "id": "AlephAlpha/luminous-base", "developer": "AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.041, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json b/data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json similarity index 82% rename from data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json rename to data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json index 74581377a..215983cef 100644 --- a/data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json +++ b/data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Extended (30B)", + "name": "Luminous Extended 30B", "id": "AlephAlpha/luminous-extended", "developer": "AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.078, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json b/data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json similarity index 82% rename from data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json rename to data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json index 9f7e37eaf..81f487c09 100644 --- a/data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json +++ b/data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Supreme (70B)", + "name": "Luminous Supreme 70B", "id": "AlephAlpha/luminous-supreme", "developer": "AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.145, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json b/data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json similarity index 82% rename from data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json rename to data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json index 9efa2b824..ef3567598 100644 --- a/data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json +++ b/data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_j2-grande/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_j2-grande/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Jurassic-2 Grande (17B)", + "name": "Jurassic-2 Grande 17B", "id": "ai21/j2-grande", "developer": "ai21", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.172, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json b/data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json similarity index 82% rename from data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json rename to data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json index 1c64f2731..f39f9c93e 100644 --- a/data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json +++ b/data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_j2-jumbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_j2-jumbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Jurassic-2 Jumbo (178B)", + "name": "Jurassic-2 Jumbo 178B", "id": "ai21/j2-jumbo", "developer": "ai21", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json b/data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json similarity index 82% rename from data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json rename to data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json index 634cd87ae..d7dee0e9a 100644 --- a/data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json +++ b/data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.637, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json b/data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json similarity index 82% rename from data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json rename to data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json index 3483b0b9a..f65e65120 100644 --- a/data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json +++ b/data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.414, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json b/data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json similarity index 82% rename from data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json rename to data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json index 527fb50a5..a3e5bda34 100644 --- a/data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json +++ b/data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-instruct/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.287, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,12 +628,14 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json b/data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json similarity index 82% rename from data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json rename to data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json index 51634a355..51375c00c 100644 --- a/data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json +++ b/data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/allenai_olmo-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/allenai_olmo-7b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo (7B)", + "name": "OLMo 7B", "id": "allenai/olmo-7b", "developer": "allenai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.052, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json b/data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json similarity index 82% rename from data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json rename to data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json index e8381a3f3..289dc9306 100644 --- a/data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json +++ b/data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.708, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,18 +506,27 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -496,10 +571,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json b/data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json similarity index 82% rename from data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json rename to data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json index 8fb5d6b37..bcd94c63d 100644 --- a/data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json +++ b/data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,18 +506,27 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -496,10 +571,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json b/data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json similarity index 82% rename from data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json rename to data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json index 52c65584f..c8589b186 100644 --- a/data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json +++ b/data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.885, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,18 +506,27 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -496,10 +571,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json b/data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json similarity index 82% rename from data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json rename to data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json index b883ce7c5..2f9d0f3e2 100644 --- a/data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json +++ b/data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-2.0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-2.0/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.489, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json b/data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json similarity index 82% rename from data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json rename to data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json index 388a1840c..fe9f851b2 100644 --- a/data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json +++ b/data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-2.1/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-2.1/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.437, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json similarity index 82% rename from data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json rename to data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json index 231b91f4e..9eecf8a25 100644 --- a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json +++ b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.531, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,18 +506,27 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -496,10 +571,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json similarity index 82% rename from data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json rename to data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json index 0ee2e76e5..f3aab2968 100644 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json +++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20240620)", + "name": "Claude 3.5 Sonnet 20240620", "id": "anthropic/claude-3-5-sonnet-20240620", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.885, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json similarity index 82% rename from data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json rename to data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json index d816a8a2a..6a814b17d 100644 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json +++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.846, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json b/data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json similarity index 82% rename from data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json rename to data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json index 66e3c14b8..54328bd79 100644 --- a/data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json +++ b/data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Haiku (20240307)", + "name": "Claude 3 Haiku 20240307", "id": "anthropic/claude-3-haiku-20240307", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.263, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json b/data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json similarity index 82% rename from data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json rename to data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json index 27c9ec758..ad60ccaa3 100644 --- a/data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json +++ b/data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Opus (20240229)", + "name": "Claude 3 Opus 20240229", "id": "anthropic/claude-3-opus-20240229", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.683, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json b/data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json similarity index 82% rename from data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json rename to data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json index 3cbea3718..35374c2f9 100644 --- a/data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json +++ b/data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Sonnet (20240229)", + "name": "Claude 3 Sonnet 20240229", "id": "anthropic/claude-3-sonnet-20240229", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.377, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json b/data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json similarity index 82% rename from data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json rename to data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json index a1592f60e..7dbf7e9ee 100644 --- a/data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json +++ b/data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.399, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json b/data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json similarity index 82% rename from data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json rename to data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json index e73713e6a..04da077b3 100644 --- a/data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json +++ b/data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-v1.3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.518, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json b/data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json similarity index 82% rename from data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json rename to data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json index aabe52512..b4ccf63fd 100644 --- a/data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json +++ b/data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-light/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-light/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.105, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json b/data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json similarity index 82% rename from data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json rename to data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json index 288bdd798..e941df44c 100644 --- a/data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json +++ b/data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-r-plus/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-r-plus/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.441, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json b/data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json similarity index 82% rename from data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json rename to data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json index 33b212443..2314d1d0d 100644 --- a/data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json +++ b/data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-r/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-r/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.299, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json b/data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json similarity index 82% rename from data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json rename to data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json index b95f59ea4..95909d3aa 100644 --- a/data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json +++ b/data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.327, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json b/data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json similarity index 82% rename from data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json rename to data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json index 7cf9a9388..81dc83db8 100644 --- a/data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json +++ b/data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/databricks_dbrx-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.289, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json similarity index 82% rename from data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json rename to data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json index bf2730468..31cee265a 100644 --- a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json +++ b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "DeepSeek LLM Chat (67B)", + "name": "DeepSeek LLM Chat 67B", "id": "deepseek-ai/deepseek-llm-67b-chat", "developer": "deepseek-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.488, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json b/data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json similarity index 82% rename from data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json rename to data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json index e07480be1..cc64f30ee 100644 --- a/data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json +++ b/data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.908, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json b/data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json similarity index 82% rename from data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json rename to data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json index eefe2f954..f6af740ee 100644 --- a/data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json +++ b/data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.0 Pro (002)", + "name": "Gemini 1.0 Pro 002", "id": "google/gemini-1.0-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.422, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json b/data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json similarity index 82% rename from data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json rename to data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json index e10645540..7c312bb83 100644 --- a/data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json +++ b/data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (001)", + "name": "Gemini 1.5 Flash 001", "id": "google/gemini-1.5-flash-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.667, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json b/data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json similarity index 82% rename from data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json rename to data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json index 8e4eb067b..450dbafcb 100644 --- a/data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json +++ b/data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.573, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json b/data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json similarity index 82% rename from data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json rename to data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json index 38c3a236a..653e006ee 100644 --- a/data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json +++ b/data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (001)", + "name": "Gemini 1.5 Pro 001", "id": "google/gemini-1.5-pro-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.739, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json b/data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json similarity index 82% rename from data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json rename to data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json index cada735aa..64f712478 100644 --- a/data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json +++ b/data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.842, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json b/data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json similarity index 82% rename from data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json rename to data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json index 1487ce304..371c57f27 100644 --- a/data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json +++ b/data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash (Experimental)", + "name": "Gemini 2.0 Flash Experimental", "id": "google/gemini-2.0-flash-exp", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.813, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,18 +506,27 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -496,10 +571,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json b/data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json similarity index 82% rename from data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json rename to data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json index 29456a114..24f598da3 100644 --- a/data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json +++ b/data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-2-27b-it/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 Instruct (27B)", + "name": "Gemma 2 Instruct 27B", "id": "google/gemma-2-27b-it", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.675, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json b/data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json similarity index 82% rename from data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json rename to data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json index 75457f70d..1e65ff610 100644 --- a/data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json +++ b/data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-2-9b-it/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 Instruct (9B)", + "name": "Gemma 2 Instruct 9B", "id": "google/gemma-2-9b-it", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.562, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json b/data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json similarity index 82% rename from data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json rename to data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json index dabc86d10..ee614ce44 100644 --- a/data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json +++ b/data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-7b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma (7B)", + "name": "Gemma 7B", "id": "google/gemma-7b", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.336, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json b/data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json similarity index 82% rename from data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json rename to data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json index 9c9727ed0..7d7c944f0 100644 --- a/data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json +++ b/data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_text-bison@001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_text-bison@001/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Bison)", + "name": "PaLM-2 Bison", "id": "google/text-bison@001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.526, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json b/data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json similarity index 82% rename from data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json rename to data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json index 2e152e4a7..f19d99b14 100644 --- a/data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json +++ b/data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_text-unicorn@001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_text-unicorn@001/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Unicorn)", + "name": "PaLM-2 Unicorn", "id": "google/text-unicorn@001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.644, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json b/data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json similarity index 82% rename from data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json rename to data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json index a5b394c06..f38e87995 100644 --- a/data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json +++ b/data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-13b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-13b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (13B)", + "name": "Llama 2 13B", "id": "meta/llama-2-13b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.233, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json b/data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json similarity index 82% rename from data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json rename to data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json index cf4407980..b0d616c29 100644 --- a/data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json +++ b/data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-70b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-70b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (70B)", + "name": "Llama 2 70B", "id": "meta/llama-2-70b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.482, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json b/data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json similarity index 82% rename from data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json rename to data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json index 3b18db79e..1e9aac924 100644 --- a/data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json +++ b/data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-7b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (7B)", + "name": "Llama 2 7B", "id": "meta/llama-2-7b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.152, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json b/data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json similarity index 82% rename from data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json rename to data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json index 90d04801d..f13ee8122 100644 --- a/data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json +++ b/data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3-70b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3-70b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (70B)", + "name": "Llama 3 70B", "id": "meta/llama-3-70b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.793, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json b/data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json similarity index 82% rename from data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json rename to data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json index 0e3ff704d..7f18bf5fd 100644 --- a/data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json +++ b/data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3-8b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3-8b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (8B)", + "name": "Llama 3 8B", "id": "meta/llama-3-8b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.387, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json similarity index 82% rename from data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json rename to data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json index 8311edd73..6ef4300e5 100644 --- a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json +++ b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.854, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json similarity index 82% rename from data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json rename to data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json index 3e59bea75..8afc05c39 100644 --- a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json +++ b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.808, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json similarity index 82% rename from data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json rename to data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json index 300f5dbb2..e5cc6d55a 100644 --- a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json +++ b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.303, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json similarity index 82% rename from data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json rename to data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json index 4daa7f500..793304d91 100644 --- a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json +++ b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (11B)", + "name": "Llama 3.2 Vision Instruct Turbo 11B", "id": "meta/llama-3.2-11b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json similarity index 82% rename from data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json rename to data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json index 17f50b1c8..615526ba3 100644 --- a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json +++ b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (90B)", + "name": "Llama 3.2 Vision Instruct Turbo 90B", "id": "meta/llama-3.2-90b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.819, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json similarity index 82% rename from data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json rename to data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json index 06851628a..8116db8ba 100644 --- a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json +++ b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.3 Instruct Turbo (70B)", + "name": "Llama 3.3 Instruct Turbo 70B", "id": "meta/llama-3.3-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.812, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json b/data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json similarity index 82% rename from data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json rename to data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json index 624d96ab6..f3354af37 100644 --- a/data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json +++ b/data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-65b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-65b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "LLaMA (65B)", + "name": "LLaMA 65B", "id": "meta/llama-65b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.345, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json b/data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json similarity index 82% rename from data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json rename to data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json index 42e0ca1f2..172c44cac 100644 --- a/data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json +++ b/data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-2/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.169, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json similarity index 82% rename from data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json rename to data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json index 40407df59..c613f7fec 100644 --- a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json +++ b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (14B)", + "name": "Phi-3 14B", "id": "microsoft/phi-3-medium-4k-instruct", "developer": "microsoft", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.509, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json b/data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json similarity index 82% rename from data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json rename to data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json index 4a88d2532..f78b3f049 100644 --- a/data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json +++ b/data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (7B)", + "name": "Phi-3 7B", "id": "microsoft/phi-3-small-8k-instruct", "developer": "microsoft", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.473, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json similarity index 82% rename from data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json rename to data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json index 81cb62772..97f8b3a1e 100644 --- a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json +++ b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json b/data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json similarity index 82% rename from data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json rename to data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json index 17ebd8348..30337d5a4 100644 --- a/data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json +++ b/data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral v0.1 (7B)", + "name": "Mistral v0.1 7B", "id": "mistralai/mistral-7b-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.292, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json b/data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json similarity index 82% rename from data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json rename to data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json index ca506f27c..edea4050d 100644 --- a/data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json +++ b/data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2402)", + "name": "Mistral Large 2402", "id": "mistralai/mistral-large-2402", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.328, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json b/data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json similarity index 82% rename from data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json rename to data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json index a10172374..d2dd06c67 100644 --- a/data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json +++ b/data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large 2 (2407)", + "name": "Mistral Large 2 2407", "id": "mistralai/mistral-large-2407", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.744, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json b/data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json similarity index 82% rename from data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json rename to data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json index 966d4c393..cbbf76044 100644 --- a/data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json +++ b/data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Medium (2312)", + "name": "Mistral Medium 2312", "id": "mistralai/mistral-medium-2312", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.268, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json b/data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json similarity index 82% rename from data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json rename to data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json index 039a9d5cc..d1c6bf6d0 100644 --- a/data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json +++ b/data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small (2402)", + "name": "Mistral Small 2402", "id": "mistralai/mistral-small-2402", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.288, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json b/data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json similarity index 82% rename from data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json rename to data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json index 781bbb2c8..d020ec2ef 100644 --- a/data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json +++ b/data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x22B)", + "name": "Mixtral 8x22B", "id": "mistralai/mixtral-8x22b", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.705, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json similarity index 82% rename from data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json rename to data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json index 818a4bd2a..ca92e5358 100644 --- a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json +++ b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x7B 32K seqlen)", + "name": "Mixtral 8x7B 32K seqlen", "id": "mistralai/mixtral-8x7b-32kseqlen", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.51, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json b/data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json similarity index 82% rename from data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json rename to data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json index dfc851db9..75b65c3cd 100644 --- a/data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json +++ b/data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral NeMo (2402)", + "name": "Mistral NeMo 2402", "id": "mistralai/open-mistral-nemo-2407", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.333, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json b/data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json similarity index 82% rename from data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json rename to data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json index 28acf453d..c3db66d6f 100644 --- a/data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json +++ b/data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0613)", + "name": "GPT-3.5 Turbo 0613", "id": "openai/gpt-3.5-turbo-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.358, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json b/data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json similarity index 82% rename from data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json rename to data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json index 6fa2534b1..3b34bbe84 100644 --- a/data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json +++ b/data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-0613/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-0613/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 (0613)", + "name": "GPT-4 0613", "id": "openai/gpt-4-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.867, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json b/data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json similarity index 82% rename from data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json rename to data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json index c0d921b54..f80298de5 100644 --- a/data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json +++ b/data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (1106 preview)", + "name": "GPT-4 Turbo 1106 preview", "id": "openai/gpt-4-1106-preview", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.698, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json similarity index 82% rename from data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json rename to data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json index 599344447..49bdd419a 100644 --- a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json +++ b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (2024-04-09)", + "name": "GPT-4 Turbo 2024-04-09", "id": "openai/gpt-4-turbo-2024-04-09", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.864, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json b/data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json similarity index 82% rename from data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json rename to data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json index 98feb8bc0..ab2f778b6 100644 --- a/data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json +++ b/data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-05-13)", + "name": "GPT-4o 2024-05-13", "id": "openai/gpt-4o-2024-05-13", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.938, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json b/data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json similarity index 82% rename from data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json rename to data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json index cb595e51b..3d286d830 100644 --- a/data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json +++ b/data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-08-06)", + "name": "GPT-4o 2024-08-06", "id": "openai/gpt-4o-2024-08-06", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.928, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json similarity index 82% rename from data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json rename to data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json index 3fb056373..53ecaa7dc 100644 --- a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json +++ b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.701, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json b/data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json similarity index 82% rename from data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json rename to data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json index d390f5b2a..c90d2c5a2 100644 --- a/data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json +++ b/data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_text-davinci-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_text-davinci-002/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 (text-davinci-002)", + "name": "GPT-3.5 text-davinci-002", "id": "openai/text-davinci-002", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.336, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json b/data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json similarity index 82% rename from data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json rename to data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json index 99961f779..6f2c648e1 100644 --- a/data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json +++ b/data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_text-davinci-003/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_text-davinci-003/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 (text-davinci-003)", + "name": "GPT-3.5 text-davinci-003", "id": "openai/text-davinci-003", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.439, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json b/data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json similarity index 82% rename from data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json rename to data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json index 6aed691a1..3b85e6b08 100644 --- a/data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json +++ b/data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 Chat (110B)", + "name": "Qwen1.5 Chat 110B", "id": "qwen/qwen1.5-110b-chat", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.55, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json b/data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json similarity index 82% rename from data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json rename to data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json index f6c7858eb..4df79c00f 100644 --- a/data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json +++ b/data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (14B)", + "name": "Qwen1.5 14B", "id": "qwen/qwen1.5-14b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.425, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json b/data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json similarity index 82% rename from data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json rename to data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json index 1314aa204..74a01181c 100644 --- a/data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json +++ b/data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (32B)", + "name": "Qwen1.5 32B", "id": "qwen/qwen1.5-32b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.546, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json b/data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json similarity index 82% rename from data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json rename to data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json index 6da42bd5b..a056d0e42 100644 --- a/data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json +++ b/data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (72B)", + "name": "Qwen1.5 72B", "id": "qwen/qwen1.5-72b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.608, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json b/data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json similarity index 82% rename from data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json rename to data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json index a4d0226b9..0757d65b1 100644 --- a/data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json +++ b/data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (7B)", + "name": "Qwen1.5 7B", "id": "qwen/qwen1.5-7b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.275, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json b/data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json similarity index 82% rename from data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json rename to data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json index 4e8665e6b..2d8d0469d 100644 --- a/data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json +++ b/data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2 Instruct (72B)", + "name": "Qwen2 Instruct 72B", "id": "qwen/qwen2-72b-instruct", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.77, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json similarity index 82% rename from data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json rename to data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json index 9e7699d4b..6091d879d 100644 --- a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json +++ b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.745, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,18 +506,27 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -496,10 +571,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json similarity index 82% rename from data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json rename to data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json index 126ae4e72..a9b9ae2a3 100644 --- a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json +++ b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.488, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,18 +506,27 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -496,10 +571,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json b/data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json similarity index 82% rename from data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json rename to data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json index a52059819..f7f93c913 100644 --- a/data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json +++ b/data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.338, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json b/data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json similarity index 82% rename from data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json rename to data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json index 518458e37..65a14de91 100644 --- a/data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json +++ b/data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/tiiuae_falcon-40b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Falcon (40B)", + "name": "Falcon 40B", "id": "tiiuae/falcon-40b", "developer": "tiiuae", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.217, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json b/data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json similarity index 82% rename from data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json rename to data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json index 4a1515414..62d1fae1c 100644 --- a/data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json +++ b/data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/tiiuae_falcon-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Falcon (7B)", + "name": "Falcon 7B", "id": "tiiuae/falcon-7b", "developer": "tiiuae", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.064, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json b/data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json similarity index 82% rename from data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json rename to data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json index fd33bd463..9e56dbbb6 100644 --- a/data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json +++ b/data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/upstage_solar-pro-241126/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.602, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -390,11 +447,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -440,17 +506,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -495,10 +570,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json b/data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json similarity index 82% rename from data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json rename to data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json index 574c20cd8..2b000451d 100644 --- a/data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json +++ b/data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-004/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-004/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.808, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -93,11 +105,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -173,12 +194,21 @@ } }, "generation_config": { - "mode": "closedbook", - "stop": "none" + "additional_details": { + "mode": "closedbook", + "stop": "none" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -224,12 +254,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -275,18 +314,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -332,23 +380,32 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True", - "stop": "none" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True", + "stop": "none" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -394,11 +451,20 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -444,18 +510,27 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -500,10 +575,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -549,14 +633,16 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ], - "stop": "none" + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ], + "stop": "none" + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json b/data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json similarity index 82% rename from data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json rename to data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json index 85f887f2f..fc600d1dc 100644 --- a/data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json +++ b/data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V2 (33B)", + "name": "Palmyra X V2 33B", "id": "writer/palmyra-x-v2", "developer": "writer", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.589, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json b/data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json similarity index 82% rename from data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json rename to data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json index ae69f6c5b..3ac2641c0 100644 --- a/data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json +++ b/data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770829788.2883599", + "retrieved_timestamp": "1770829788.2883599", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V3 (72B)", + "name": "Palmyra X V3 72B", "id": "writer/palmyra-x-v3", "developer": "writer", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,10 +47,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NarrativeQA - F1", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -92,10 +104,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "NaturalQuestions (closed-book) - F1", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", "lower_is_better": false, @@ -171,11 +192,20 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { "evaluation_name": "OpenbookQA - EM", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -221,12 +251,21 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MMLU - EM", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -272,18 +311,27 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { "evaluation_name": "MATH - Equivalent (CoT)", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", "lower_is_better": false, @@ -329,22 +377,31 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { "evaluation_name": "GSM8K - EM", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", "lower_is_better": false, @@ -389,10 +446,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "LegalBench - EM", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -438,17 +504,26 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { "evaluation_name": "MedQA - EM", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", "lower_is_better": false, @@ -493,10 +568,19 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { "evaluation_name": "WMT 2014 - BLEU-4", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", "lower_is_better": false, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json b/data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json similarity index 82% rename from data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json rename to data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json index a5bdb42fc..cdb3ca461 100644 --- a/data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json +++ b/data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/01-ai_yi-34b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (34B)", + "name": "Yi 34B", "id": "01-ai/yi-34b", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.315, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json b/data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json similarity index 82% rename from data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json rename to data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json index 5b2c50278..1b8b7e56f 100644 --- a/data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json +++ b/data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/01-ai_yi-6b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (6B)", + "name": "Yi 6B", "id": "01-ai/yi-6b", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.651, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json b/data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json similarity index 82% rename from data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json rename to data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json index 938fbc9f2..29bc15bb3 100644 --- a/data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json +++ b/data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi Large (Preview)", + "name": "Yi Large Preview", "id": "01-ai/yi-large-preview", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.258, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json b/data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json similarity index 82% rename from data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json rename to data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json index b05362e32..2bf971f25 100644 --- a/data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json +++ b/data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.147, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json b/data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json similarity index 82% rename from data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json rename to data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json index 374350118..7ffc27970 100644 --- a/data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json +++ b/data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.206, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json b/data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json similarity index 82% rename from data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json rename to data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json index 2f32db71e..92ba45d60 100644 --- a/data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json +++ b/data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.887, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json b/data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json similarity index 82% rename from data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json rename to data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json index 0ee329ec3..e53150712 100644 --- a/data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json +++ b/data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo 1.7 (7B)", + "name": "OLMo 1.7 7B", "id": "allenai/olmo-1.7-7b", "developer": "allenai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json b/data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json similarity index 82% rename from data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json rename to data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json index dc71abcb3..301523f0f 100644 --- a/data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json +++ b/data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/allenai_olmo-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo (7B)", + "name": "OLMo 7B", "id": "allenai/olmo-7b", "developer": "allenai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.68, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json b/data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json similarity index 82% rename from data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json rename to data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json index 036d68cdd..d80215b78 100644 --- a/data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json +++ b/data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.987, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json b/data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json similarity index 82% rename from data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json rename to data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json index dc2e53d31..f28fc4ccf 100644 --- a/data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json +++ b/data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 1.0, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json b/data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json similarity index 82% rename from data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json rename to data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json index 74dd04dc4..66455ef1d 100644 --- a/data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json +++ b/data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.975, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json b/data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json similarity index 82% rename from data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json rename to data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json index 94c86600d..163a9d31a 100644 --- a/data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json +++ b/data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.048, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json similarity index 82% rename from data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json rename to data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json index 15ba960b1..edabc3b81 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json +++ b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.128, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json similarity index 82% rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json index 43e320af9..8d402d4fb 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json +++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20240620)", + "name": "Claude 3.5 Sonnet 20240620", "id": "anthropic/claude-3-5-sonnet-20240620", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.17, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json similarity index 82% rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json index 7df36bb32..a435d5c4d 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json +++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.311, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json similarity index 82% rename from data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json rename to data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json index 9885a79d4..66b68fa6b 100644 --- a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json +++ b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Haiku (20240307)", + "name": "Claude 3 Haiku 20240307", "id": "anthropic/claude-3-haiku-20240307", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.28, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json b/data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json similarity index 82% rename from data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json rename to data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json index ab57a1503..140c0db28 100644 --- a/data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json +++ b/data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Opus (20240229)", + "name": "Claude 3 Opus 20240229", "id": "anthropic/claude-3-opus-20240229", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.014, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json similarity index 82% rename from data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json rename to data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json index 710c70a2e..a6eb131df 100644 --- a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json +++ b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Sonnet (20240229)", + "name": "Claude 3 Sonnet 20240229", "id": "anthropic/claude-3-sonnet-20240229", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.082, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json b/data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json similarity index 82% rename from data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json rename to data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json index b632a0864..38a7ffacb 100644 --- a/data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json +++ b/data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.186, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json b/data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json similarity index 82% rename from data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json rename to data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json index 6ef0cc597..4b85be9b2 100644 --- a/data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json +++ b/data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/cohere_command-r-plus/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.825, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json b/data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json similarity index 82% rename from data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json rename to data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json index 6fa172bf8..90cbd571c 100644 --- a/data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json +++ b/data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/cohere_command-r/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/cohere_command-r/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.959, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json b/data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json similarity index 82% rename from data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json rename to data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json index 8d0b57f82..753506525 100644 --- a/data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json +++ b/data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.537, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json similarity index 82% rename from data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json rename to data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json index 7837e5696..f12e3799a 100644 --- a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json +++ b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "DeepSeek LLM Chat (67B)", + "name": "DeepSeek LLM Chat 67B", "id": "deepseek-ai/deepseek-llm-67b-chat", "developer": "deepseek-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.387, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json b/data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json similarity index 82% rename from data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json rename to data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json index b9d5d50e7..86fd9dec9 100644 --- a/data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json +++ b/data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json b/data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json similarity index 82% rename from data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json rename to data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json index 4fb164090..0184241c6 100644 --- a/data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json +++ b/data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.0 Pro (001)", + "name": "Gemini 1.0 Pro 001", "id": "google/gemini-1.0-pro-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.677, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json b/data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json similarity index 82% rename from data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json rename to data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json index a91e47447..7baa6457e 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (001)", + "name": "Gemini 1.5 Flash 001", "id": "google/gemini-1.5-flash-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.47, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json b/data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json similarity index 82% rename from data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json rename to data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json index c8a9b1912..f095d6361 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.817, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json similarity index 82% rename from data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json rename to data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json index ffdf7910d..fe99bd4e4 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (0514 preview)", + "name": "Gemini 1.5 Flash 0514 preview", "id": "google/gemini-1.5-flash-preview-0514", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.713, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json b/data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json similarity index 82% rename from data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json rename to data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json index 0115a3fa0..4b9fc2846 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (001)", + "name": "Gemini 1.5 Pro 001", "id": "google/gemini-1.5-pro-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.349, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json b/data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json similarity index 82% rename from data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json rename to data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json index 1c57dbb48..47f80252d 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.334, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json similarity index 82% rename from data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json rename to data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json index 065435cc3..901c1dd01 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (0409 preview)", + "name": "Gemini 1.5 Pro 0409 preview", "id": "google/gemini-1.5-pro-preview-0409", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.118, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json b/data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json similarity index 82% rename from data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json rename to data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json index 9b4101c21..0eda6b6b1 100644 --- a/data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json +++ b/data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash (Experimental)", + "name": "Gemini 2.0 Flash Experimental", "id": "google/gemini-2.0-flash-exp", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.567, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json b/data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json similarity index 82% rename from data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json rename to data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json index 2a0eccbe5..142296fc4 100644 --- a/data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json +++ b/data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-2-27b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 (27B)", + "name": "Gemma 2 27B", "id": "google/gemma-2-27b", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.05, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json b/data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json similarity index 82% rename from data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json rename to data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json index 7b83a32f9..6f84fd47f 100644 --- a/data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json +++ b/data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-2-9b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 (9B)", + "name": "Gemma 2 9B", "id": "google/gemma-2-9b", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.265, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json b/data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json similarity index 82% rename from data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json rename to data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json index 1480d9d56..ac525859f 100644 --- a/data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json +++ b/data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-7b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma (7B)", + "name": "Gemma 7B", "id": "google/gemma-7b", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json b/data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json similarity index 82% rename from data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json rename to data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json index a20b853b7..b20dbe54d 100644 --- a/data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json +++ b/data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_text-bison@001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_text-bison@001/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Bison)", + "name": "PaLM-2 Bison", "id": "google/text-bison@001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.192, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json b/data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json similarity index 82% rename from data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json rename to data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json index 061cfda40..7b3536f41 100644 --- a/data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json +++ b/data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_text-unicorn@001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Unicorn)", + "name": "PaLM-2 Unicorn", "id": "google/text-unicorn@001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.142, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json b/data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json similarity index 82% rename from data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json rename to data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json index 999bc7bce..a786ac0dd 100644 --- a/data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json +++ b/data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-13b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (13B)", + "name": "Llama 2 13B", "id": "meta/llama-2-13b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.502, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json b/data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json similarity index 82% rename from data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json rename to data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json index 2bd647ad6..bd988b6d8 100644 --- a/data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json +++ b/data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-70b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (70B)", + "name": "Llama 2 70B", "id": "meta/llama-2-70b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.508, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json b/data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json similarity index 82% rename from data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json rename to data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json index f7641555c..b29cd7460 100644 --- a/data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json +++ b/data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (7B)", + "name": "Llama 2 7B", "id": "meta/llama-2-7b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.681, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json b/data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json similarity index 82% rename from data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json rename to data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json index 028924f0a..d46d7f50a 100644 --- a/data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json +++ b/data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3-70b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (70B)", + "name": "Llama 3 70B", "id": "meta/llama-3-70b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json b/data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json similarity index 82% rename from data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json rename to data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json index 493305a26..31dfddc02 100644 --- a/data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json +++ b/data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3-8b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (8B)", + "name": "Llama 3 8B", "id": "meta/llama-3-8b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.733, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json similarity index 82% rename from data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json rename to data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json index 5e68e1b5a..64eb43090 100644 --- a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json +++ b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.33, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json similarity index 82% rename from data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json rename to data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json index 7f880e52b..149eb0100 100644 --- a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json +++ b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.021, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json similarity index 82% rename from data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json rename to data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json index bdc0510b6..46bd04117 100644 --- a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json +++ b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.475, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json similarity index 82% rename from data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json rename to data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json index e9ec2f904..187d1c6a7 100644 --- a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json +++ b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (11B)", + "name": "Llama 3.2 Vision Instruct Turbo 11B", "id": "meta/llama-3.2-11b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.897, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json similarity index 82% rename from data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json rename to data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json index 51cb25f1e..9625c1e16 100644 --- a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json +++ b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (90B)", + "name": "Llama 3.2 Vision Instruct Turbo 90B", "id": "meta/llama-3.2-90b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.773, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json similarity index 82% rename from data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json rename to data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json index 124028675..8effae129 100644 --- a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json +++ b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.3 Instruct Turbo (70B)", + "name": "Llama 3.3 Instruct Turbo 70B", "id": "meta/llama-3.3-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.722, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json b/data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json similarity index 82% rename from data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json rename to data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json index f3162d0fe..07027b7fb 100644 --- a/data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json +++ b/data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-2/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-2/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json similarity index 82% rename from data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json rename to data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json index 97f9c3c96..6ce22179c 100644 --- a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json +++ b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (14B)", + "name": "Phi-3 14B", "id": "microsoft/phi-3-medium-4k-instruct", "developer": "microsoft", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.015, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json similarity index 82% rename from data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json rename to data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json index 9da3cad91..7278b002a 100644 --- a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json +++ b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (7B)", + "name": "Phi-3 7B", "id": "microsoft/phi-3-small-8k-instruct", "developer": "microsoft", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.708, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json similarity index 82% rename from data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json rename to data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json index 2592b75a7..886ff1732 100644 --- a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json +++ b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.509, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json b/data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json similarity index 82% rename from data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json rename to data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json index 77ee3f1a1..935804d7f 100644 --- a/data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json +++ b/data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral v0.1 (7B)", + "name": "Mistral v0.1 7B", "id": "mistralai/mistral-7b-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.213, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json b/data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json similarity index 82% rename from data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json rename to data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json index c34e3e47f..bc72ce600 100644 --- a/data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json +++ b/data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2402)", + "name": "Mistral Large 2402", "id": "mistralai/mistral-large-2402", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.464, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json b/data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json similarity index 82% rename from data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json rename to data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json index 4e005a631..272dc142d 100644 --- a/data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json +++ b/data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large 2 (2407)", + "name": "Mistral Large 2 2407", "id": "mistralai/mistral-large-2407", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.24, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json b/data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json similarity index 82% rename from data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json rename to data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json index ddc506063..607dc1e03 100644 --- a/data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json +++ b/data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small (2402)", + "name": "Mistral Small 2402", "id": "mistralai/mistral-small-2402", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.54, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json b/data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json similarity index 82% rename from data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json rename to data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json index 35cc50f7b..401d4b7c8 100644 --- a/data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json +++ b/data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x22B)", + "name": "Mixtral 8x22B", "id": "mistralai/mixtral-8x22b", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.598, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json similarity index 82% rename from data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json rename to data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json index 247f8572e..b88295eb7 100644 --- a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json +++ b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x7B 32K seqlen)", + "name": "Mixtral 8x7B 32K seqlen", "id": "mistralai/mixtral-8x7b-32kseqlen", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.689, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json similarity index 82% rename from data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json rename to data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json index 20e5d8bc5..5a436d9c9 100644 --- a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json +++ b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral NeMo (2402)", + "name": "Mistral NeMo 2402", "id": "mistralai/open-mistral-nemo-2407", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json similarity index 82% rename from data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json rename to data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json index 61bdc2a92..5923a61b0 100644 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json +++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0125)", + "name": "GPT-3.5 Turbo 0125", "id": "openai/gpt-3.5-turbo-0125", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.493, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json similarity index 82% rename from data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json rename to data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json index a7037b692..c62c20e9c 100644 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json +++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0613)", + "name": "GPT-3.5 Turbo 0613", "id": "openai/gpt-3.5-turbo-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.589, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json b/data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json similarity index 82% rename from data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json rename to data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json index 8a368f8b6..9877671a2 100644 --- a/data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json +++ b/data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 (0613)", + "name": "GPT-4 0613", "id": "openai/gpt-4-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.517, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json b/data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json similarity index 82% rename from data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json rename to data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json index 41438331c..448f5bbca 100644 --- a/data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json +++ b/data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (1106 preview)", + "name": "GPT-4 Turbo 1106 preview", "id": "openai/gpt-4-1106-preview", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.416, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json similarity index 82% rename from data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json rename to data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json index a7796e764..aefe21734 100644 --- a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json +++ b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (2024-04-09)", + "name": "GPT-4 Turbo 2024-04-09", "id": "openai/gpt-4-turbo-2024-04-09", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.351, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json b/data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json similarity index 82% rename from data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json rename to data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json index 1572c27c7..efc7bbe5a 100644 --- a/data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json +++ b/data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-05-13)", + "name": "GPT-4o 2024-05-13", "id": "openai/gpt-4o-2024-05-13", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.671, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json b/data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json similarity index 82% rename from data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json rename to data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json index 4ba84b207..fe9568710 100644 --- a/data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json +++ b/data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-08-06)", + "name": "GPT-4o 2024-08-06", "id": "openai/gpt-4o-2024-08-06", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.52, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json similarity index 82% rename from data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json rename to data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json index f69b1b3d4..681eae3b7 100644 --- a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json +++ b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.774, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json b/data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json similarity index 82% rename from data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json rename to data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json index 190b1dce2..6667a05bb 100644 --- a/data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json +++ b/data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 Chat (110B)", + "name": "Qwen1.5 Chat 110B", "id": "qwen/qwen1.5-110b-chat", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.875, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json b/data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json similarity index 82% rename from data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json rename to data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json index 7ff151a72..ce5d472c6 100644 --- a/data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json +++ b/data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (14B)", + "name": "Qwen1.5 14B", "id": "qwen/qwen1.5-14b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.796, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json b/data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json similarity index 82% rename from data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json rename to data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json index 421333da5..ff8059b60 100644 --- a/data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json +++ b/data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (32B)", + "name": "Qwen1.5 32B", "id": "qwen/qwen1.5-32b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.624, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json b/data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json similarity index 82% rename from data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json rename to data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json index d14327eec..c69a6d09c 100644 --- a/data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json +++ b/data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (72B)", + "name": "Qwen1.5 72B", "id": "qwen/qwen1.5-72b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.65, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json b/data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json similarity index 82% rename from data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json rename to data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json index d9688a597..8651674c9 100644 --- a/data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json +++ b/data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (7B)", + "name": "Qwen1.5 7B", "id": "qwen/qwen1.5-7b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.843, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json b/data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json similarity index 82% rename from data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json rename to data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json index abb62e63a..89026d1dc 100644 --- a/data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json +++ b/data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2 Instruct (72B)", + "name": "Qwen2 Instruct 72B", "id": "qwen/qwen2-72b-instruct", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.826, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json similarity index 82% rename from data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json rename to data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json index ee06a7f3d..1a03b982a 100644 --- a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json +++ b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.548, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json similarity index 82% rename from data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json rename to data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json index f8033410f..032da16a1 100644 --- a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json +++ b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.887, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json similarity index 82% rename from data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json rename to data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json index cde071792..5482f32f0 100644 --- a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json +++ b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.565, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json b/data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json similarity index 82% rename from data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json rename to data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json index 7d7fe6a40..b71ad83e6 100644 --- a/data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json +++ b/data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.462, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json b/data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json similarity index 82% rename from data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json rename to data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json index c2c0d493b..734ce34f3 100644 --- a/data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json +++ b/data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.629, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json b/data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json similarity index 82% rename from data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json rename to data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json index fd6405aa5..0e5669e0a 100644 --- a/data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json +++ b/data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770830564.5477738", + "retrieved_timestamp": "1770830564.5477738", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V3 (72B)", + "name": "Palmyra X V3 72B", "id": "writer/palmyra-x-v3", "developer": "writer", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "MMLU All Subjects - EM", + "source_data": { + "dataset_name": "MMLU All Subjects", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -65,130 +69,139 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { "evaluation_name": "Abstract Algebra - EM", + "source_data": { + "dataset_name": "Abstract Algebra", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -234,14 +247,23 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { "evaluation_name": "Anatomy - EM", + "source_data": { + "dataset_name": "Anatomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -287,14 +309,23 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { "evaluation_name": "College Physics - EM", + "source_data": { + "dataset_name": "College Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -490,14 +521,23 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { "evaluation_name": "Computer Security - EM", + "source_data": { + "dataset_name": "Computer Security", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -543,14 +583,23 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { "evaluation_name": "Econometrics - EM", + "source_data": { + "dataset_name": "Econometrics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -596,14 +645,23 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { "evaluation_name": "Global Facts - EM", + "source_data": { + "dataset_name": "Global Facts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -649,14 +707,23 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { "evaluation_name": "Jurisprudence - EM", + "source_data": { + "dataset_name": "Jurisprudence", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -702,14 +769,23 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { "evaluation_name": "Philosophy - EM", + "source_data": { + "dataset_name": "Philosophy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -755,14 +831,23 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { "evaluation_name": "Professional Psychology - EM", + "source_data": { + "dataset_name": "Professional Psychology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -898,14 +983,23 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { "evaluation_name": "Us Foreign Policy - EM", + "source_data": { + "dataset_name": "Us Foreign Policy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -951,14 +1045,23 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { "evaluation_name": "Astronomy - EM", + "source_data": { + "dataset_name": "Astronomy", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1004,14 +1107,23 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { "evaluation_name": "Business Ethics - EM", + "source_data": { + "dataset_name": "Business Ethics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1057,14 +1169,23 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { "evaluation_name": "Clinical Knowledge - EM", + "source_data": { + "dataset_name": "Clinical Knowledge", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1110,14 +1231,23 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { "evaluation_name": "Conceptual Physics - EM", + "source_data": { + "dataset_name": "Conceptual Physics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1163,14 +1293,23 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { "evaluation_name": "Electrical Engineering - EM", + "source_data": { + "dataset_name": "Electrical Engineering", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1216,14 +1355,23 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { "evaluation_name": "Elementary Mathematics - EM", + "source_data": { + "dataset_name": "Elementary Mathematics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1269,14 +1417,23 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { "evaluation_name": "Formal Logic - EM", + "source_data": { + "dataset_name": "Formal Logic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1322,14 +1479,23 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { "evaluation_name": "High School World History - EM", + "source_data": { + "dataset_name": "High School World History", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1765,14 +1931,23 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { "evaluation_name": "Human Sexuality - EM", + "source_data": { + "dataset_name": "Human Sexuality", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1848,14 +2023,23 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { "evaluation_name": "International Law - EM", + "source_data": { + "dataset_name": "International Law", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1901,14 +2085,23 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { "evaluation_name": "Logical Fallacies - EM", + "source_data": { + "dataset_name": "Logical Fallacies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -1954,14 +2147,23 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { "evaluation_name": "Machine Learning - EM", + "source_data": { + "dataset_name": "Machine Learning", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2007,14 +2209,23 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { "evaluation_name": "Management - EM", + "source_data": { + "dataset_name": "Management", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2060,14 +2271,23 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { "evaluation_name": "Marketing - EM", + "source_data": { + "dataset_name": "Marketing", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2113,14 +2333,23 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { "evaluation_name": "Medical Genetics - EM", + "source_data": { + "dataset_name": "Medical Genetics", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2166,14 +2395,23 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { "evaluation_name": "Miscellaneous - EM", + "source_data": { + "dataset_name": "Miscellaneous", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2219,14 +2457,23 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { "evaluation_name": "Moral Scenarios - EM", + "source_data": { + "dataset_name": "Moral Scenarios", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2302,14 +2549,23 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { "evaluation_name": "Nutrition - EM", + "source_data": { + "dataset_name": "Nutrition", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2355,14 +2611,23 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { "evaluation_name": "Prehistory - EM", + "source_data": { + "dataset_name": "Prehistory", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2408,14 +2673,23 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { "evaluation_name": "Public Relations - EM", + "source_data": { + "dataset_name": "Public Relations", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2461,14 +2735,23 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { "evaluation_name": "Security Studies - EM", + "source_data": { + "dataset_name": "Security Studies", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2514,14 +2797,23 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { "evaluation_name": "Sociology - EM", + "source_data": { + "dataset_name": "Sociology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2567,14 +2859,23 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { "evaluation_name": "Virology - EM", + "source_data": { + "dataset_name": "Virology", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2620,14 +2921,23 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { "evaluation_name": "World Religions - EM", + "source_data": { + "dataset_name": "World Religions", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", "lower_is_better": false, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/scripts/HELM/parse_helm_leaderboards.sh b/scripts/HELM/parse_helm_leaderboards.sh new file mode 100755 index 000000000..a89a1a64e --- /dev/null +++ b/scripts/HELM/parse_helm_leaderboards.sh @@ -0,0 +1,9 @@ +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Capabilities --source_data_url https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Lite --source_data_url https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Classic --source_data_url https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Instruct --source_data_url https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_MMLU --source_data_url https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json \ No newline at end of file diff --git a/utils/helm/adapter.py b/utils/helm/adapter.py index 3297cfac9..acb5330d9 100644 --- a/utils/helm/adapter.py +++ b/utils/helm/adapter.py @@ -22,10 +22,12 @@ EvaluationLog, EvaluationResult, EvaluatorRelationship, + GenerationConfig, MetricConfig, ModelInfo, ScoreDetails, ScoreType, + SourceDataUrl ) import sys @@ -114,7 +116,7 @@ def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> T else: spec = run_spec_names[0] args = spec.split(":", 1)[1].split(",") - + model_details = next( (arg.split("=", 1)[1] for arg in args if arg.startswith("model=")), "", @@ -126,12 +128,14 @@ def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> T if developer == "unknown": developer = get_developer(model_name) - return make_model_info( + model_info = make_model_info( model_name=model_name, developer=developer, inference_platform="unknown", - ), model_id + ) + model_info.id = model_id + return model_info def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): """Determine min/max values for each metric column.""" @@ -152,7 +156,6 @@ def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): def convert( leaderboard_name: str, leaderboard_data: List[Dict[str, Any]], - source_data: List[str], ): """Convert HELM leaderboard data into unified evaluation logs.""" retrieved_timestamp = str(time.time()) @@ -172,9 +175,9 @@ def convert( model_name = row[0].get("value") if model_name not in model_infos: - model_info, model_id = extract_model_info_from_row(row, model_name) + model_info = extract_model_info_from_row(row, model_name) model_infos[model_name] = model_info - model_ids[model_name] = model_id + model_ids[model_name] = model_info.id for col_idx, (header, cell) in enumerate(zip(headers[1:], row[1:])): full_eval_name = header.get("value") @@ -203,6 +206,17 @@ def convert( score_type=ScoreType.continuous, ) + if full_eval_name.lower().startswith('mean'): + dataset_name = leaderboard_name + else: + dataset_name = full_eval_name.split(' - ')[0] + + source_data = SourceDataUrl( + dataset_name=dataset_name, + source_type='url', + url=[args.source_data_url] + ) + generation_config = ( extract_generation_config(cell.get("run_spec_names", [])) if cell.get("run_spec_names") @@ -211,6 +225,7 @@ def convert( model_results[model_name][short_name] = EvaluationResult( evaluation_name=full_eval_name, + source_data=source_data, metric_config=metric_config, score_details=ScoreDetails( score=round(cell.get("value"), 3) @@ -221,7 +236,9 @@ def convert( "tab": tab_name, }, ), - generation_config=generation_config, + generation_config=GenerationConfig( + additional_details=generation_config + ) ) else: # Add extra score details under the same metric @@ -232,12 +249,16 @@ def convert( else f"{full_eval_name} - {tab_name}" ) - existing.score_details.details[detail_key] = { - "description": cell.get("description"), - "tab": tab_name, - "score": cell.get("value"), - } - + setattr( + existing.score_details.details, + detail_key, + { + "description": cell.get("description"), + "tab": tab_name, + "score": cell.get("value"), + } + ) + # Save evaluation logs for model_name, results_by_metric in model_results.items(): model_info = model_infos[model_name] @@ -250,7 +271,7 @@ def convert( ) eval_log = EvaluationLog( - schema_version="0.1.0", + schema_version="0.2.0", evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, source_metadata=make_source_metadata( @@ -259,7 +280,6 @@ def convert( evaluator_relationship=EvaluatorRelationship.third_party, ), model_info=model_info, - source_data=source_data, evaluation_results=list(results_by_metric.values()), ) @@ -287,15 +307,13 @@ def convert( args = parse_args() leaderboard_name = args.leaderboard_name.lower() - source_data = [args.source_data_url] print(f"Fetching {leaderboard_name} data from {args.source_data_url}") - leaderboard_data = fetch_json(source_data[0]) + leaderboard_data = fetch_json(args.source_data_url) convert( leaderboard_name=leaderboard_name, - leaderboard_data=leaderboard_data, - source_data=source_data, + leaderboard_data=leaderboard_data ) print("Done!") From b77590228e88dde571bf32017cdf923e787a8c1b Mon Sep 17 00:00:00 2001 From: Damian Stachura Date: Wed, 11 Feb 2026 19:53:47 +0100 Subject: [PATCH 2/2] Fix naming conventions --- ...8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json} | 24 +- ...7d2d1dba-1b31-47b2-8308-f2668cf36c99.json} | 24 +- ...3a056f7b-1bdf-4543-9e67-1101ace67179.json} | 24 +- ...275cf2e5-5ccd-40be-be55-938c82ef6688.json} | 24 +- ...43e7be99-4872-4eb1-b30b-75c44b298ab4.json} | 24 +- ...cfc99298-4570-48cf-9187-aa0d167cc0ba.json} | 24 +- ...a2162367-d16d-4274-aa89-43435cea5c0b.json} | 24 +- ...51ef4580-da13-415a-a37f-45e2036ed4c2.json} | 24 +- ...3fa605db-fcff-4f05-9398-6af77c9dcada.json} | 24 +- ...9d58ac39-fef7-47c8-920a-8be2069f5662.json} | 24 +- ...dd9b10af-ad39-45ef-8f91-097340d376c7.json} | 24 +- ...30a6de14-c57c-483e-92e9-26fc4c7f4772.json} | 24 +- ...bed1a799-77a6-40a1-9f37-d54fe9d4d055.json} | 24 +- ...6c226cad-23f1-4c09-8038-eb7b776cdee4.json} | 24 +- ...98887061-09d6-44ba-9cff-0267045a26ef.json} | 24 +- ...6693f0e2-3514-413d-be61-d10f7372b3dc.json} | 24 +- ...ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json} | 24 +- ...0d9a856d-01bf-4a82-9872-33d561cf4a57.json} | 24 +- ...3ff2ab7d-2c0f-4313-8223-8f514fde595a.json} | 24 +- ...2a46e8da-1996-428c-b567-cd0287b29d9f.json} | 24 +- ...30a92593-398e-4c2f-8be7-455be166aeaf.json} | 24 +- ...e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json} | 24 +- ...dfc2717d-ead8-4287-885e-5e0fc09c35e3.json} | 24 +- ...e97292eb-7031-4a3a-a415-44c137898e3f.json} | 24 +- ...4263a6be-9640-40a1-8881-768624949d47.json} | 24 +- ...a808cecf-8925-428f-99ea-b6c2f8bce96e.json} | 24 +- ...55e44a3b-1fac-4ad5-b25e-85702f33883d.json} | 24 +- ...5b5b339b-7631-4b77-ac51-df49d3e946eb.json} | 24 +- ...eaec6d66-6da7-4592-baca-2539240acc5d.json} | 24 +- ...2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json} | 24 +- ...eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json} | 24 +- ...75b5943a-67be-4b2f-85da-a52533edc76f.json} | 24 +- ...8bec35b7-271a-457d-b665-9f69baa248aa.json} | 24 +- ...c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json} | 24 +- ...c308b0a5-4c44-4369-9b23-8664959aa927.json} | 24 +- ...1a1edfb2-f0f1-4930-82c0-99293ec76645.json} | 24 +- ...9aa5af51-8c55-4896-b634-162a9d82b58e.json} | 24 +- ...21461a52-2f25-48c9-be19-f9233317d817.json} | 24 +- ...bdea0967-fcc7-493c-a18d-70727842deb9.json} | 24 +- ...f7404ea3-62c7-47fc-9106-44c208470381.json} | 24 +- ...2817820c-4b28-4235-a8fd-ad02d0f504bc.json} | 24 +- ...f3da71fc-fc88-4dda-b423-168d11eab317.json} | 24 +- ...2f7c0db9-b5de-4674-a130-5315520dea68.json} | 24 +- ...4dcb8022-fe54-42f7-b43f-9866de173731.json} | 24 +- ...c436f3d1-84ee-49df-9287-0305925f7cf4.json} | 24 +- ...90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json} | 24 +- ...07c823ba-9e17-47e4-858b-a1f2a514a276.json} | 24 +- ...eb1bb443-71ad-4b79-8308-2b66c5e8c631.json} | 24 +- ...e14d42a9-9639-4c35-8a0c-e395e754c46c.json} | 24 +- ...3754df44-ddce-4a66-9074-f65f5677ae27.json} | 24 +- ...a540b282-e9d6-403e-96df-a1d27ad14d3a.json} | 24 +- ...758851b3-9ac9-43d8-8b6a-3d9688752d80.json} | 24 +- ...1d9ac688-ca0d-405b-a262-e95673e79250.json} | 24 +- ...c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json} | 24 +- ...35a31e19-2ef5-4caa-a848-422af42adab8.json} | 24 +- ...7de0bda2-ce56-444a-b293-a310a5b2d7ab.json} | 24 +- ...dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json} | 24 +- ...9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json} | 24 +- ...07763926-3a19-43f9-a23f-095f6cb78799.json} | 24 +- ...56e024b3-c963-4172-9f52-7605276b3854.json} | 24 +- ...6f660e47-1d86-473d-9864-208111dcea31.json} | 24 +- ...91ef1f96-a708-4c53-ac9d-208ef3420668.json} | 24 +- ...c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json} | 24 +- ...505c6245-88d1-4557-9e34-63a4e8086210.json} | 24 +- ...9a473236-f187-4926-ae8a-e8b84fe2a060.json} | 24 +- ...1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json} | 24 +- ...aeabfb59-74db-445c-9693-7a088ac5073c.json} | 24 +- ...eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json} | 24 +- ...12fdea65-94eb-4c85-876c-65f0528bde12.json} | 60 ++--- ...d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json} | 60 ++--- ...1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json} | 60 ++--- ...deddbc80-70ac-43e7-b052-753d127f8390.json} | 60 ++--- ...e4780862-bf3c-4856-b1e7-02616afe931a.json} | 60 ++--- ...cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json} | 60 ++--- ...13a22d40-f274-4384-adcc-1539da821c6a.json} | 60 ++--- ...a01f642e-730b-461d-8afe-9c077ab3f149.json} | 60 ++--- ...813802a3-483e-443d-9e49-7cd581b5ea6d.json} | 60 ++--- ...90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json} | 60 ++--- ...d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json} | 60 ++--- ...3dc29785-a884-4496-a6f4-a8bf19892e50.json} | 60 ++--- ...ff8dc291-bbaf-4149-854e-e1780b0c86d5.json} | 60 ++--- ...b8932181-b669-4b0e-8879-1dfbf9afea12.json} | 60 ++--- ...c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json} | 60 ++--- ...579fb908-3c36-4ff8-a262-fd5388806b83.json} | 60 ++--- ...68ff9f10-0357-4ea8-b758-de6c7f51d669.json} | 60 ++--- ...b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json} | 60 ++--- ...8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json} | 60 ++--- ...8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json} | 60 ++--- ...6bbe052f-46f7-4541-80a3-dbb86433db7a.json} | 60 ++--- ...9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json} | 60 ++--- ...742a59e8-c813-42ef-938a-4897e25dcdad.json} | 60 ++--- ...5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json} | 60 ++--- ...509360bc-86f5-49dc-899c-2899d8b6bc6c.json} | 60 ++--- ...8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json} | 60 ++--- ...8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json} | 60 ++--- ...7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json} | 60 ++--- ...d65d8f48-8b8e-4ec6-af68-f61af5408adf.json} | 60 ++--- ...dff69882-cb8b-4323-b587-60f295085459.json} | 60 ++--- ...90220411-5e4d-4b74-a74c-ca2ad030d50e.json} | 60 ++--- ...8c2465b2-deca-476c-bb41-836685ceab35.json} | 60 ++--- ...4b0f6a03-1054-4047-82d1-53992f0378ee.json} | 60 ++--- ...78bc128a-6e53-4086-9498-2b3428e1d884.json} | 60 ++--- ...2be7887e-6c91-437c-bbfc-8b68de3330da.json} | 60 ++--- ...f135ce21-655f-4ebf-9cc6-d83ada0f177b.json} | 60 ++--- ...48912a61-af54-4208-b36d-2f3a283e5c5d.json} | 60 ++--- ...cc85315f-4472-4b22-9f0a-e4609676ce13.json} | 60 ++--- ...ab773619-db5e-449b-8d6b-da743cb038bb.json} | 60 ++--- ...5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json} | 60 ++--- ...32cc2aa3-be26-41bd-8124-a8b1073c84c4.json} | 60 ++--- ...42a86a4a-7e76-4c7d-af48-e765a38df589.json} | 60 ++--- ...f9746ed1-887f-4850-ac2d-700de18acbaf.json} | 60 ++--- ...899521d0-e5eb-4e1b-af5a-78b3bd32e232.json} | 60 ++--- ...1fb2c6db-2495-4609-a96b-57815c579953.json} | 60 ++--- ...a5b6cc8b-676d-4c19-8093-0b893937e3d4.json} | 60 ++--- ...0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json} | 60 ++--- ...bc207557-fb49-4a87-8401-22c3ce853e7c.json} | 60 ++--- ...895266ee-71a5-4ca5-b3f9-62df6383ff95.json} | 60 ++--- ...8828e9e8-5716-41b4-a2d1-233bb056dc32.json} | 60 ++--- ...f267ba72-b239-4126-99c5-675f79b1ae95.json} | 60 ++--- ...f386e763-8078-454b-bd14-32b106663d53.json} | 60 ++--- ...a4739cda-028b-48e0-b3b5-ca9b583d03f5.json} | 60 ++--- ...837e20ff-fed1-4431-b643-63b904055c66.json} | 60 ++--- ...e411f017-22c6-4d49-9bf9-5d99c1091791.json} | 60 ++--- ...7bd2b266-5a65-4c63-bf18-5e4114564bfc.json} | 60 ++--- ...49a1423e-d5f4-4665-b81e-d491f492a316.json} | 60 ++--- ...8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json} | 60 ++--- ...ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json} | 60 ++--- ...a2b4ed40-b04f-481f-986b-25a2c26bbb79.json} | 60 ++--- ...e88f9163-5334-43ed-9b56-154bf543f898.json} | 60 ++--- ...6d436bd5-9d49-4895-8c07-7814b2eef12c.json} | 60 ++--- ...681d0d6d-de06-4b8e-a7e2-964d98e2806e.json} | 60 ++--- ...e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json} | 60 ++--- ...cb80bd5f-204a-4dd8-96ec-40c7df93975f.json} | 60 ++--- ...f84f84a8-7191-42ac-8951-5d7141a0f700.json} | 60 ++--- ...9ba74767-b675-460a-bb68-e82adb6acd2f.json} | 60 ++--- ...e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json} | 28 +-- ...60724488-914d-4efe-98d6-f3ff26fe8fbc.json} | 28 +-- ...2aaae404-b510-41e0-9a4a-b2d053731454.json} | 28 +-- ...053badb4-b50a-434a-909c-c4d939c00b4e.json} | 28 +-- ...7b4a4c6d-e302-4010-a099-5b01c874ffe8.json} | 40 ++-- ...db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json} | 40 ++-- ...f6808908-79d9-4de5-8434-94e4bdb854f2.json} | 40 ++-- ...1a039ef6-5957-4246-82b2-bc607b6554e7.json} | 40 ++-- ...fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json} | 40 ++-- ...0e2790d3-40f1-4124-ba41-b65bd9de1852.json} | 40 ++-- ...d55129d3-4eae-4009-a897-fa1624cea6a2.json} | 40 ++-- ...6332f0b3-7fab-41ed-a8da-46b142051377.json} | 40 ++-- ...0cb33741-ca10-40f5-90d3-28e300901ad3.json} | 40 ++-- ...80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json} | 40 ++-- ...de41775f-f60e-481e-a8ef-3df9a9b65a5a.json} | 40 ++-- ...bc29d5c6-b5c8-473b-b69c-054026829089.json} | 40 ++-- ...ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json} | 40 ++-- ...4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json} | 40 ++-- ...9ef56d5a-de00-4d89-930c-a4c74211dd78.json} | 40 ++-- ...5598d3ed-5b37-4aec-b186-0b16c394633b.json} | 40 ++-- ...a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json} | 40 ++-- ...54bac699-aa82-4133-8c10-c6510c2a7f95.json} | 40 ++-- ...79b23601-3148-4256-88ce-67e439a87c5b.json} | 40 ++-- ...e92648e4-75c6-4944-9ec1-880823fefc87.json} | 40 ++-- ...449feffd-d2e3-4a08-ad69-b8ad522532ae.json} | 40 ++-- ...d297b253-0f4f-4caf-864b-9f457ab589da.json} | 40 ++-- ...d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json} | 40 ++-- ...cb409208-034d-42fd-acce-ab5cc4227383.json} | 40 ++-- ...b2572ef8-446a-45b4-b557-45736418753b.json} | 40 ++-- ...70d85516-b710-4b27-b664-03a6a822773b.json} | 40 ++-- ...a8208df4-eb37-47d2-8845-f821e80e9858.json} | 40 ++-- ...22cde248-40ab-43b0-a408-6d8b84692f22.json} | 40 ++-- ...b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json} | 40 ++-- ...ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json} | 40 ++-- ...8721a15b-9102-4b1a-bde8-e5371f00f1b5.json} | 40 ++-- ...23b3a30c-8aa3-4684-be54-adae003720fc.json} | 40 ++-- ...7022c444-d6b8-4374-be0c-14835e5fd281.json} | 40 ++-- ...bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json} | 40 ++-- ...bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json} | 40 ++-- ...527418d0-2591-43c9-b639-17328292b110.json} | 40 ++-- ...8ddc465f-4f2d-4213-81c4-70b584d48047.json} | 40 ++-- ...eca63d17-7fc2-4722-8bb3-0be99a257100.json} | 40 ++-- ...e40a10b3-e682-4715-b2ee-4efcae050a58.json} | 40 ++-- ...56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json} | 40 ++-- ...f47ca10d-cd45-485e-b9cf-0c6592d63656.json} | 40 ++-- ...7f0e318e-31bf-4044-bffb-357c1238d4fd.json} | 40 ++-- ...818d6d72-0b5c-4fcf-b808-1d186223301e.json} | 40 ++-- ...f09b853b-dbbc-4252-a0f0-a2c45c29f670.json} | 40 ++-- ...f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json} | 40 ++-- ...83c6a723-87a0-43d4-968e-86d186578e9e.json} | 40 ++-- ...daaf221b-1759-4619-91fb-938e81975787.json} | 40 ++-- ...6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json} | 40 ++-- ...1043b815-b247-4444-bf8c-0b92b793c57f.json} | 40 ++-- ...28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json} | 40 ++-- ...73dedd31-7d40-4ee6-994d-00eb7d656597.json} | 40 ++-- ...18da1dfa-5366-477b-a9cf-af29c5a99b68.json} | 40 ++-- ...80057cc1-45ab-4976-878e-be963eaa83b1.json} | 40 ++-- ...d896249f-bbd9-4657-a5db-5968544cb5fa.json} | 40 ++-- ...9f73f3e5-b573-45d4-8c98-82f5c496f786.json} | 40 ++-- ...a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json} | 40 ++-- ...4ff688da-61a0-43ce-9c2d-e1c197887683.json} | 40 ++-- ...181003ea-7587-4c93-8b89-c5c76958313d.json} | 40 ++-- ...66688228-e59a-4caa-b3fb-c5df1efc9db4.json} | 40 ++-- ...2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json} | 40 ++-- ...077fe37f-b3a4-483a-93a5-034c6445fe98.json} | 40 ++-- ...4fbb173c-b900-4e11-87bd-1ac6a489d014.json} | 40 ++-- ...e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json} | 40 ++-- ...0925f9b7-08f8-485f-84bc-a153a54aa417.json} | 40 ++-- ...08082277-8305-4007-97cd-88202fc0115c.json} | 40 ++-- ...fe554cbd-2480-40bd-b2f5-464cad700c14.json} | 40 ++-- ...9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json} | 40 ++-- ...d9654997-1d3e-41c3-9f16-05a36dde9b02.json} | 40 ++-- ...73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json} | 40 ++-- ...4d01d929-b5e2-42dc-89ee-20560f560db5.json} | 40 ++-- ...76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json} | 40 ++-- ...69ea0ef0-c136-4cff-9607-6ae12e0692c3.json} | 40 ++-- ...bbe708f3-fb78-49e9-876d-cae57f1231cc.json} | 40 ++-- ...ab7b7951-0792-4538-8a7a-6baee8602cbb.json} | 40 ++-- ...fc94c95d-9678-4f23-b82f-190a08ece307.json} | 40 ++-- ...3f92e2fc-9831-4c2c-b94e-af33d457fa82.json} | 40 ++-- ...3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json} | 40 ++-- ...6b2891bd-2444-4286-8ccf-c91181856d29.json} | 40 ++-- ...bd924bd3-e13c-48e0-b339-8c15c5072038.json} | 40 ++-- ...b8a6f32a-9904-43bb-9add-89404093a9db.json} | 40 ++-- ...c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json} | 40 ++-- ...9c1fc50a-437d-458b-926c-33cabdcc4aeb.json} | 40 ++-- ...5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json} | 40 ++-- ...10e1abfa-83de-4960-8d4c-c5099894cb80.json} | 40 ++-- ...40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json} | 40 ++-- ...2abf3bb8-a78f-4a59-807e-52da4e6426fd.json} | 40 ++-- ...ae28615a-b7fa-4782-89e1-4b8e4804dc62.json} | 40 ++-- ...52bb6ab9-e80b-4bf0-a375-7706f16d311d.json} | 40 ++-- ...fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json} | 40 ++-- ...1158720a-9a0e-492e-a677-9b0936f4cde5.json} | 40 ++-- ...254ded81-4051-420d-b402-2e7b80a23848.json} | 40 ++-- ...ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json} | 214 +++++++++--------- ...7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json} | 214 +++++++++--------- ...5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json} | 214 +++++++++--------- ...0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json} | 214 +++++++++--------- ...92e0b1b9-c167-4e07-b770-2b78527eb4eb.json} | 214 +++++++++--------- ...3da06ad4-0770-45f5-a6a2-9ef9500cef05.json} | 214 +++++++++--------- ...c1c79360-60bd-4f5d-a746-e0411b94f69b.json} | 214 +++++++++--------- ...bb904716-048c-4b41-9f64-4d17c485afe3.json} | 214 +++++++++--------- ...063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json} | 214 +++++++++--------- ...c8949c55-8987-4ed3-b74b-8b13b4381806.json} | 214 +++++++++--------- ...ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json} | 214 +++++++++--------- ...bc9cedd7-5cb2-44b2-abda-470322570e14.json} | 214 +++++++++--------- ...305a7f25-6e22-4146-9678-6a687a701567.json} | 214 +++++++++--------- ...c6059976-85a1-40ce-b02f-67e182aa2f7d.json} | 214 +++++++++--------- ...6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json} | 214 +++++++++--------- ...f397ca7a-41c4-4926-b075-2523639f0a50.json} | 214 +++++++++--------- ...acdf4701-e1c2-4867-bd85-d34ae8fb0991.json} | 214 +++++++++--------- ...3cd855af-9679-4fd0-bc3f-34db697c7855.json} | 214 +++++++++--------- ...78fb6814-e32f-4b15-b958-9e001637ba07.json} | 214 +++++++++--------- ...f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json} | 214 +++++++++--------- ...cefc3b25-0779-4fb3-93a5-3c7a285304af.json} | 214 +++++++++--------- ...7e00e082-0e79-45e0-b0ff-5458cc2aff85.json} | 214 +++++++++--------- ...ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json} | 214 +++++++++--------- ...c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json} | 214 +++++++++--------- ...7ea5b404-d98f-4282-81d8-6ca5f6629429.json} | 214 +++++++++--------- ...7056c7e7-f68a-4764-aa48-a8368ae2e317.json} | 214 +++++++++--------- ...5e67014d-6ca1-4e65-a85a-84d91e147d4d.json} | 214 +++++++++--------- ...3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json} | 214 +++++++++--------- ...46d5e547-507e-4c98-98a9-bad1bfad7f7b.json} | 214 +++++++++--------- ...ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json} | 214 +++++++++--------- ...2b31b441-caa9-465c-a2d2-051c951c7be3.json} | 214 +++++++++--------- ...b7ea6c93-af70-4c0f-ba50-03a539416a8b.json} | 214 +++++++++--------- ...fe4cec30-e483-49a8-80ea-00b2c6231740.json} | 214 +++++++++--------- ...53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json} | 214 +++++++++--------- ...af88b02d-cb29-4d2c-bb33-5fddcf316a95.json} | 214 +++++++++--------- ...a0abcd19-58a1-478a-9786-d044a4181241.json} | 214 +++++++++--------- ...95eda13a-cd34-4170-b2db-f2ead47250f9.json} | 214 +++++++++--------- ...7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json} | 214 +++++++++--------- ...9da7439c-e96b-444f-b4fa-7ef638080740.json} | 214 +++++++++--------- ...294b22a0-1676-4d8c-8ad2-5cdc40267255.json} | 214 +++++++++--------- ...1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json} | 214 +++++++++--------- ...78f2484e-bc73-4026-929b-db345e92cf5a.json} | 214 +++++++++--------- ...8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json} | 214 +++++++++--------- ...41af381a-3637-4578-a582-59d9b1327d95.json} | 214 +++++++++--------- ...96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json} | 214 +++++++++--------- ...bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json} | 214 +++++++++--------- ...e036de72-b425-4aa5-9448-dc52560e60db.json} | 214 +++++++++--------- ...65423181-18f1-4296-98c2-171356106404.json} | 214 +++++++++--------- ...41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json} | 214 +++++++++--------- ...f78d6e0a-a397-4a41-a37e-696bda5a1987.json} | 214 +++++++++--------- ...d2bf70ce-341f-49d7-bd03-87b523826953.json} | 214 +++++++++--------- ...b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json} | 214 +++++++++--------- ...08590b6e-7050-413d-844b-1f3f1c5aa444.json} | 214 +++++++++--------- ...2d18fd88-73b5-4d4c-a1cc-e66a20316605.json} | 214 +++++++++--------- ...567918be-be6f-4e41-b613-727828fe8a44.json} | 214 +++++++++--------- ...c2be131b-808c-4947-b24f-69ef6af499d7.json} | 214 +++++++++--------- ...24955250-a2e9-475f-a866-30a835579e03.json} | 214 +++++++++--------- ...de6f7e19-b54a-4bd3-b624-29f66afbee15.json} | 214 +++++++++--------- ...e4c3032d-04e0-414b-a7e9-e30756d82000.json} | 214 +++++++++--------- ...e9a41d4b-56c7-47f0-b439-72ad1e463000.json} | 214 +++++++++--------- ...a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json} | 214 +++++++++--------- ...fd6aea24-dc18-41ce-bc19-23f461a39032.json} | 214 +++++++++--------- ...625d33ce-a320-4bfd-a962-451b8c22d392.json} | 214 +++++++++--------- ...e51be257-610e-4d38-b58a-a3b29fc06a83.json} | 214 +++++++++--------- ...9e0b9f48-f913-4bbe-a135-59e596c9e479.json} | 214 +++++++++--------- ...189e6cc5-1c8f-4712-8dda-c108f18f836d.json} | 214 +++++++++--------- ...4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json} | 214 +++++++++--------- ...ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json} | 214 +++++++++--------- ...fa6a6772-671b-402e-9480-d61e0fb4a61e.json} | 214 +++++++++--------- ...b5279e94-ae7f-4671-9315-874e162a24fd.json} | 214 +++++++++--------- ...de00e8da-9c83-40df-b642-b94719ce1ac2.json} | 214 +++++++++--------- ...119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json} | 214 +++++++++--------- ...80aabdf4-60b7-493b-98d8-1854f1c41c10.json} | 214 +++++++++--------- ...29958cee-32c9-4d51-8f14-72db4273459f.json} | 214 +++++++++--------- ...72537b16-feda-4e5e-a477-f415650db847.json} | 214 +++++++++--------- ...7df68af5-667a-4125-9c12-e71fb5af0a74.json} | 214 +++++++++--------- ...1845eb8b-4c94-4d22-8771-012f7230dc62.json} | 214 +++++++++--------- ...b2c8cfd1-f09a-4616-8038-c7e1930bce74.json} | 214 +++++++++--------- ...12976629-cefe-4329-b974-bb17f88d385d.json} | 214 +++++++++--------- utils/helm/adapter.py | 24 +- 310 files changed, 13172 insertions(+), 13162 deletions(-) rename data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/{bd982107-7c03-4ee8-8a38-782d68883818.json => 8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json} (92%) rename data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/{25aa6e41-ab16-4f63-9613-bfb83b9151c5.json => 7d2d1dba-1b31-47b2-8308-f2668cf36c99.json} (92%) rename data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/{ddd52881-1248-4652-9f1d-5d2b58ede889.json => 3a056f7b-1bdf-4543-9e67-1101ace67179.json} (92%) rename data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/{365bc693-73b6-41fe-a8fa-eba7b91febe0.json => 275cf2e5-5ccd-40be-be55-938c82ef6688.json} (92%) rename data/helm_capabilities/amazon/nova-lite-v1_0/{a126b881-918a-411a-90e9-32d7b63d1e00.json => 43e7be99-4872-4eb1-b30b-75c44b298ab4.json} (92%) rename data/helm_capabilities/amazon/nova-micro-v1_0/{b8e54bb1-0768-4558-8dc2-4897d4e571aa.json => cfc99298-4570-48cf-9187-aa0d167cc0ba.json} (92%) rename data/helm_capabilities/amazon/nova-premier-v1_0/{a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json => a2162367-d16d-4274-aa89-43435cea5c0b.json} (92%) rename data/helm_capabilities/amazon/nova-pro-v1_0/{2413b504-7125-461b-ae9d-0c58211a5358.json => 51ef4580-da13-415a-a37f-45e2036ed4c2.json} (92%) rename data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/{f350d9d1-b743-4017-bc68-a4dc726515d0.json => 3fa605db-fcff-4f05-9398-6af77c9dcada.json} (92%) rename data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/{c32a1f0a-bf8a-42be-b155-4f87465235bc.json => 9d58ac39-fef7-47c8-920a-8be2069f5662.json} (92%) rename data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/{96cfde1b-77de-4d2a-8b45-938116795108.json => dd9b10af-ad39-45ef-8f91-097340d376c7.json} (92%) rename data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/{56c180e5-45aa-4106-8f92-c6566c3c7dfc.json => 30a6de14-c57c-483e-92e9-26fc4c7f4772.json} (92%) rename data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/{d633fcd6-eb01-49ff-ba7c-6ca12734746f.json => bed1a799-77a6-40a1-9f37-d54fe9d4d055.json} (92%) rename data/helm_capabilities/anthropic/claude-opus-4-20250514/{7a7b49ff-5060-4d12-acb9-607125fbe081.json => 6c226cad-23f1-4c09-8038-eb7b776cdee4.json} (92%) rename data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/{287a3646-d969-4bd9-9773-86463c1ba87f.json => 98887061-09d6-44ba-9cff-0267045a26ef.json} (92%) rename data/helm_capabilities/anthropic/claude-sonnet-4-20250514/{97f3892f-9588-49ef-abef-3a0c965bb352.json => 6693f0e2-3514-413d-be61-d10f7372b3dc.json} (92%) rename data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/{22ba68b0-6eec-47f2-b465-47f298e8da09.json => ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json} (92%) rename data/helm_capabilities/deepseek-ai/deepseek-r1-0528/{9e5684dc-6380-4353-b966-7205d66340fa.json => 0d9a856d-01bf-4a82-9872-33d561cf4a57.json} (92%) rename data/helm_capabilities/deepseek-ai/deepseek-v3/{1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json => 3ff2ab7d-2c0f-4313-8223-8f514fde595a.json} (92%) rename data/helm_capabilities/google/gemini-1.5-flash-002/{20512a3b-ac0f-483a-8bec-9962980c579c.json => 2a46e8da-1996-428c-b567-cd0287b29d9f.json} (92%) rename data/helm_capabilities/google/gemini-1.5-pro-002/{704c5c74-a0ee-457d-9b4e-3ae895ffc105.json => 30a92593-398e-4c2f-8be7-455be166aeaf.json} (92%) rename data/helm_capabilities/google/gemini-2.0-flash-001/{eb9224b8-0edb-4605-a2ee-cfb63f41370e.json => e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json} (92%) rename data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/{4cb58f80-c2b1-45c6-b781-19af47660eb0.json => dfc2717d-ead8-4287-885e-5e0fc09c35e3.json} (91%) rename data/helm_capabilities/google/gemini-2.5-flash-lite/{6307e0c4-c983-4257-82d8-b2a50171eb8a.json => e97292eb-7031-4a3a-a415-44c137898e3f.json} (92%) rename data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/{275cd615-bddf-4afe-a499-b463fe183486.json => 4263a6be-9640-40a1-8881-768624949d47.json} (92%) rename data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/{03b48360-a387-44ba-94b2-2eb7c234a9fa.json => a808cecf-8925-428f-99ea-b6c2f8bce96e.json} (92%) rename data/helm_capabilities/google/gemini-3-pro-preview/{3a242fb8-07f9-460e-93eb-345aab0f994f.json => 55e44a3b-1fac-4ad5-b25e-85702f33883d.json} (92%) rename data/helm_capabilities/ibm/granite-3.3-8b-instruct/{5e5720d0-67fe-40a9-b65b-d4154848d83c.json => 5b5b339b-7631-4b77-ac51-df49d3e946eb.json} (92%) rename data/helm_capabilities/ibm/granite-4.0-h-small/{9c9239df-0cbb-411f-af40-1b2782f91255.json => eaec6d66-6da7-4592-baca-2539240acc5d.json} (92%) rename data/helm_capabilities/ibm/granite-4.0-micro/{e1d12d96-185f-493e-bb08-8237623fb736.json => 2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json} (92%) rename data/helm_capabilities/marin-community/marin-8b-instruct/{aba1fded-b031-48df-87ef-dc744df33501.json => eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json} (92%) rename data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/{98f69aa6-b227-4076-a76e-1293cbe1c6cb.json => 75b5943a-67be-4b2f-85da-a52533edc76f.json} (92%) rename data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/{d2bb087e-a275-4fce-b6dc-001fd4545883.json => 8bec35b7-271a-457d-b665-9f69baa248aa.json} (92%) rename data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/{84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json => c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json} (92%) rename data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/{23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json => c308b0a5-4c44-4369-9b23-8664959aa927.json} (92%) rename data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/{9cab3a77-4f32-48d0-ba11-e2323ccc4861.json => 1a1edfb2-f0f1-4930-82c0-99293ec76645.json} (92%) rename data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/{9e037c92-1253-49be-b31a-3aa017531d77.json => 9aa5af51-8c55-4896-b634-162a9d82b58e.json} (92%) rename data/helm_capabilities/mistralai/mistral-large-2411/{bd26c7cb-ce76-4b17-b617-d1d93a168c93.json => 21461a52-2f25-48c9-be19-f9233317d817.json} (92%) rename data/helm_capabilities/mistralai/mistral-small-2503/{9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json => bdea0967-fcc7-493c-a18d-70727842deb9.json} (92%) rename data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/{d69a1cbe-353c-4be9-b93b-5224d24c7adf.json => f7404ea3-62c7-47fc-9106-44c208470381.json} (92%) rename data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/{915cb39d-f21f-4ef1-a95f-f44f79ede893.json => 2817820c-4b28-4235-a8fd-ad02d0f504bc.json} (92%) rename data/helm_capabilities/moonshotai/kimi-k2-instruct/{fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json => f3da71fc-fc88-4dda-b423-168d11eab317.json} (92%) rename data/helm_capabilities/openai/gpt-4.1-2025-04-14/{eb51f418-6abf-4b2c-9f57-0b830c00bd15.json => 2f7c0db9-b5de-4674-a130-5315520dea68.json} (92%) rename data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/{41cd14b0-46ba-49da-844a-19fe866bef1e.json => 4dcb8022-fe54-42f7-b43f-9866de173731.json} (92%) rename data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/{7de93642-a4bc-430b-8733-9befeb6a0e23.json => c436f3d1-84ee-49df-9287-0305925f7cf4.json} (92%) rename data/helm_capabilities/openai/gpt-4o-2024-11-20/{4f18292a-1fef-4feb-9b17-045c96e3e137.json => 90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json} (92%) rename data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/{7458c032-b24d-4f13-a659-b6e19d19a8e1.json => 07c823ba-9e17-47e4-858b-a1f2a514a276.json} (92%) rename data/helm_capabilities/openai/gpt-5-2025-08-07/{21eb1648-aad0-4297-9edc-c445e4c38694.json => eb1bb443-71ad-4b79-8308-2b66c5e8c631.json} (92%) rename data/helm_capabilities/openai/gpt-5-mini-2025-08-07/{99d657ae-e850-4caf-a599-13f1b8072273.json => e14d42a9-9639-4c35-8a0c-e395e754c46c.json} (92%) rename data/helm_capabilities/openai/gpt-5-nano-2025-08-07/{10cd766e-442c-4b3d-833b-740417d9a6d9.json => 3754df44-ddce-4a66-9074-f65f5677ae27.json} (92%) rename data/helm_capabilities/openai/gpt-5.1-2025-11-13/{bc6124a7-89df-4c3e-b824-56c948d1eeb5.json => a540b282-e9d6-403e-96df-a1d27ad14d3a.json} (92%) rename data/helm_capabilities/openai/gpt-oss-120b/{06719cd4-5654-49b6-9dee-e112d1601d1c.json => 758851b3-9ac9-43d8-8b6a-3d9688752d80.json} (92%) rename data/helm_capabilities/openai/gpt-oss-20b/{ed849999-48c2-4569-8bcd-dc73084e3197.json => 1d9ac688-ca0d-405b-a262-e95673e79250.json} (91%) rename data/helm_capabilities/openai/o3-2025-04-16/{01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json => c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json} (92%) rename data/helm_capabilities/openai/o4-mini-2025-04-16/{32382d69-21c7-43a9-bb95-27607ec18cc9.json => 35a31e19-2ef5-4caa-a848-422af42adab8.json} (92%) rename data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/{77e702f7-37ef-4487-b047-74b13ef6d966.json => 7de0bda2-ce56-444a-b293-a310a5b2d7ab.json} (92%) rename data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/{4ee3c647-740c-41a6-ac66-4a38b09317ff.json => dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json} (92%) rename data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/{ca30726a-00a6-4228-94fe-5dce00de1d5e.json => 9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json} (92%) rename data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/{7862890a-298b-4bda-b8f1-7be6a5779365.json => 07763926-3a19-43f9-a23f-095f6cb78799.json} (92%) rename data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/{8c73a09f-ba0d-4c12-a12a-776a17292151.json => 56e024b3-c963-4172-9f52-7605276b3854.json} (92%) rename data/helm_capabilities/writer/palmyra-fin/{442aed0d-95c3-4436-ad63-b7b1e93307f4.json => 6f660e47-1d86-473d-9864-208111dcea31.json} (91%) rename data/helm_capabilities/writer/palmyra-med/{7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json => 91ef1f96-a708-4c53-ac9d-208ef3420668.json} (91%) rename data/helm_capabilities/writer/palmyra-x-004/{bc2c91e0-6afd-4e44-b665-d5c7558f8981.json => c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json} (92%) rename data/helm_capabilities/writer/palmyra-x5/{a74b74f7-ccce-4341-a122-26728cc6bece.json => 505c6245-88d1-4557-9e34-63a4e8086210.json} (91%) rename data/helm_capabilities/xai/grok-3-beta/{87811b75-afe8-413b-949d-7fd1f582a2e8.json => 9a473236-f187-4926-ae8a-e8b84fe2a060.json} (91%) rename data/helm_capabilities/xai/grok-3-mini-beta/{ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json => 1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json} (92%) rename data/helm_capabilities/xai/grok-4-0709/{924080a0-c530-4e6d-b1a4-107de3bd7183.json => aeabfb59-74db-445c-9693-7a088ac5073c.json} (91%) rename data/helm_capabilities/zai-org/glm-4.5-air-fp8/{be23c720-a99a-4945-bc0b-ddc27c8eec39.json => eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json} (92%) rename data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/{425d4a41-2def-4581-9b61-ee33ecb3a822.json => 12fdea65-94eb-4c85-876c-65f0528bde12.json} (91%) rename data/helm_classic/ai21/J1-Grande-v1-17B/{c12a8494-bafc-4097-874a-7c00636e96f8.json => d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json} (92%) rename data/helm_classic/ai21/J1-Grande-v2-beta-17B/{4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json => 1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json} (92%) rename data/helm_classic/ai21/J1-Jumbo-v1-178B/{19f61327-fcc3-408f-9254-2d6a2aadcd4e.json => deddbc80-70ac-43e7-b052-753d127f8390.json} (92%) rename data/helm_classic/ai21/J1-Large-v1-7.5B/{ccc17d56-bd26-409c-ac3f-262eaba9ce21.json => e4780862-bf3c-4856-b1e7-02616afe931a.json} (92%) rename data/helm_classic/ai21/Jurassic-2-Grande-17B/{f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json => cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json} (92%) rename data/helm_classic/ai21/Jurassic-2-Jumbo-178B/{9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json => 13a22d40-f274-4384-adcc-1539da821c6a.json} (92%) rename data/helm_classic/ai21/Jurassic-2-Large-7.5B/{f25c142c-8730-4241-a649-01d076e1f28d.json => a01f642e-730b-461d-8afe-9c077ab3f149.json} (91%) rename data/helm_classic/aleph-alpha/Luminous-Base-13B/{ab34f23e-36db-40c0-9681-f30b00692f98.json => 813802a3-483e-443d-9e49-7cd581b5ea6d.json} (91%) rename data/helm_classic/aleph-alpha/Luminous-Extended-30B/{67281534-a03d-49d8-a586-25cb1a03134e.json => 90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json} (91%) rename data/helm_classic/aleph-alpha/Luminous-Supreme-70B/{3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json => d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json} (91%) rename data/helm_classic/bigscience/BLOOM-176B/{04ce2ba4-c382-4658-ba06-1def9499a243.json => 3dc29785-a884-4496-a6f4-a8bf19892e50.json} (91%) rename data/helm_classic/bigscience/T0pp-11B/{3a546396-d031-4958-8410-00e0d3406089.json => ff8dc291-bbaf-4149-854e-e1780b0c86d5.json} (93%) rename data/helm_classic/cohere/Cohere-Command-beta-52.4B/{e7b99aa6-08e8-4224-a805-16586eb44325.json => b8932181-b669-4b0e-8879-1dfbf9afea12.json} (92%) rename data/helm_classic/cohere/Cohere-Command-beta-6.1B/{43a3fe19-929a-463d-a0ed-791dad765188.json => c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json} (92%) rename data/helm_classic/cohere/Cohere-large-v20220720-13.1B/{75468958-b75b-41fe-9813-070b793e86d9.json => 579fb908-3c36-4ff8-a262-fd5388806b83.json} (92%) rename data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/{6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json => 68ff9f10-0357-4ea8-b758-de6c7f51d669.json} (92%) rename data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/{3c9c425a-ce4a-4958-9744-7f9490ed5729.json => b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json} (92%) rename data/helm_classic/cohere/Cohere-small-v20220720-410M/{5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json => 8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json} (92%) rename data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/{8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json => 8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json} (92%) rename data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/{f8044c74-3f1c-4562-a21c-e448061b2077.json => 6bbe052f-46f7-4541-80a3-dbb86433db7a.json} (92%) rename data/helm_classic/eleutherai/Pythia-12B/{4abe3a0d-ba04-41f7-b107-59f11ff5697a.json => 9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json} (91%) rename data/helm_classic/eleutherai/Pythia-6.9B/{646adb7b-0761-4639-8776-83ea158bfca4.json => 742a59e8-c813-42ef-938a-4897e25dcdad.json} (91%) rename data/helm_classic/google/Palmyra-X-43B/{85cf6be2-d066-4e1b-b373-d53d3c922184.json => 5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json} (91%) rename data/helm_classic/google/T5-11B/{52db5c6d-b54e-401a-880d-8ab41a394bc4.json => 509360bc-86f5-49dc-899c-2899d8b6bc6c.json} (91%) rename data/helm_classic/google/UL2-20B/{68becad6-9455-4d3d-8d68-d1b4448598a1.json => 8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json} (91%) rename data/helm_classic/lmsys/Vicuna-v1.3-13B/{519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json => 8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json} (91%) rename data/helm_classic/lmsys/Vicuna-v1.3-7B/{972bc5db-f536-42f9-aa51-83cc2f59b76a.json => 7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json} (91%) rename data/helm_classic/meta/LLaMA-13B/{b2220101-56e0-49d9-a3d1-d3bec769ab97.json => d65d8f48-8b8e-4ec6-af68-f61af5408adf.json} (91%) rename data/helm_classic/meta/LLaMA-30B/{96907b25-05c3-441b-afc4-69274c20bfc3.json => dff69882-cb8b-4323-b587-60f295085459.json} (91%) rename data/helm_classic/meta/LLaMA-65B/{66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json => 90220411-5e4d-4b74-a74c-ca2ad030d50e.json} (91%) rename data/helm_classic/meta/LLaMA-7B/{70e9e156-6807-489b-b77a-367236614826.json => 8c2465b2-deca-476c-bb41-836685ceab35.json} (91%) rename data/helm_classic/meta/Llama-2-13B/{e90cfb46-1173-4d22-9329-9bf57cdd5241.json => 4b0f6a03-1054-4047-82d1-53992f0378ee.json} (91%) rename data/helm_classic/meta/Llama-2-70B/{baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json => 78bc128a-6e53-4086-9498-2b3428e1d884.json} (91%) rename data/helm_classic/meta/Llama-2-7B/{7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json => 2be7887e-6c91-437c-bbfc-8b68de3330da.json} (91%) rename data/helm_classic/meta/OPT-175B/{ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json => f135ce21-655f-4ebf-9cc6-d83ada0f177b.json} (92%) rename data/helm_classic/meta/OPT-66B/{26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json => 48912a61-af54-4208-b36d-2f3a283e5c5d.json} (92%) rename data/helm_classic/microsoft/TNLG-v2-530B/{ecd21c26-cdc4-43b1-b933-4d970df9413a.json => cc85315f-4472-4b22-9f0a-e4609676ce13.json} (91%) rename data/helm_classic/microsoft/TNLG-v2-6.7B/{9d4350eb-cdf0-432f-b3b0-45f4832ca950.json => ab773619-db5e-449b-8d6b-da743cb038bb.json} (91%) rename data/helm_classic/mistralai/Mistral-v0.1-7B/{3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json => 5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json} (91%) rename data/helm_classic/mosaicml/MPT-30B/{b277c87e-54b5-466f-97d7-35db4cd7b985.json => 32cc2aa3-be26-41bd-8124-a8b1073c84c4.json} (91%) rename data/helm_classic/mosaicml/MPT-Instruct-30B/{270df23b-9e58-4259-a8ed-0d25b9c80b2a.json => 42a86a4a-7e76-4c7d-af48-e765a38df589.json} (91%) rename data/helm_classic/openai/GPT-J-6B/{1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json => f9746ed1-887f-4850-ac2d-700de18acbaf.json} (92%) rename data/helm_classic/openai/GPT-NeoX-20B/{ef171b67-72a6-46d3-9eaf-4614ff474852.json => 899521d0-e5eb-4e1b-af5a-78b3bd32e232.json} (92%) rename data/helm_classic/openai/ada-350M/{e6ea5f7e-0533-4a99-8638-1cc10c454238.json => 1fb2c6db-2495-4609-a96b-57815c579953.json} (94%) rename data/helm_classic/openai/babbage-1.3B/{83c924fe-6318-4bad-adb0-8a81e5e28ee0.json => a5b6cc8b-676d-4c19-8093-0b893937e3d4.json} (94%) rename data/helm_classic/openai/curie-6.7B/{82e2c0e3-66f2-431f-b4b8-d2495970d998.json => 0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json} (94%) rename data/helm_classic/openai/davinci-175B/{6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json => bc207557-fb49-4a87-8401-22c3ce853e7c.json} (94%) rename data/helm_classic/openai/gpt-3.5-turbo-0301/{e18fbf9e-677c-49fb-ab76-475e8f605f01.json => 895266ee-71a5-4ca5-b3f9-62df6383ff95.json} (91%) rename data/helm_classic/openai/gpt-3.5-turbo-0613/{039af363-0c5c-4e36-8396-cd57c7e4c1de.json => 8828e9e8-5716-41b4-a2d1-233bb056dc32.json} (91%) rename data/helm_classic/openai/text-ada-001/{8ea1facb-260a-461d-9271-2c07b318c46f.json => f267ba72-b239-4126-99c5-675f79b1ae95.json} (94%) rename data/helm_classic/openai/text-babbage-001/{93007ac9-04c2-451d-abd2-2f235297747e.json => f386e763-8078-454b-bd14-32b106663d53.json} (94%) rename data/helm_classic/openai/text-curie-001/{b04e5f90-e46e-4d7a-a6a9-569bde072208.json => a4739cda-028b-48e0-b3b5-ca9b583d03f5.json} (94%) rename data/helm_classic/openai/text-davinci-002/{933dc76f-45f0-48e0-93ae-3e19cff87c2a.json => 837e20ff-fed1-4431-b643-63b904055c66.json} (94%) rename data/helm_classic/openai/text-davinci-003/{b8408a64-eb89-4337-8ee5-3c48e4e24437.json => e411f017-22c6-4d49-9bf9-5d99c1091791.json} (94%) rename data/helm_classic/stanford/Alpaca-7B/{d5846321-0800-4ff9-b85c-53c8b4884ba5.json => 7bd2b266-5a65-4c63-bf18-5e4114564bfc.json} (91%) rename data/helm_classic/tiiuae/Falcon-40B/{baa5f92c-b626-4e09-a084-61ce7f5dee98.json => 49a1423e-d5f4-4665-b81e-d491f492a316.json} (91%) rename data/helm_classic/tiiuae/Falcon-7B/{9b648e90-8d3c-403d-9ad8-382ef0b212a6.json => 8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json} (91%) rename data/helm_classic/tiiuae/Falcon-Instruct-40B/{0692f762-337e-4c20-8ad6-feecc93882a3.json => ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json} (91%) rename data/helm_classic/tiiuae/Falcon-Instruct-7B/{a91c9563-0756-4616-8a58-3c8000f73895.json => a2b4ed40-b04f-481f-986b-25a2c26bbb79.json} (91%) rename data/helm_classic/together/RedPajama-INCITE-Base-7B/{3a329574-dcf6-4177-b37c-c495e6af6cc5.json => e88f9163-5334-43ed-9b56-154bf543f898.json} (91%) rename data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/{9e662c1e-e77c-4fb3-b589-127683a4b2ca.json => 6d436bd5-9d49-4895-8c07-7814b2eef12c.json} (91%) rename data/helm_classic/together/RedPajama-INCITE-Instruct-7B/{375140f6-bd3f-4b55-a35c-23de37254296.json => 681d0d6d-de06-4b8e-a7e2-964d98e2806e.json} (91%) rename data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/{021d0b25-8f58-47da-a58c-ac532a7972bf.json => e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json} (91%) rename data/helm_classic/writer/InstructPalmyra-30B/{9207fec4-d0c4-4f66-b917-f5ed57409215.json => cb80bd5f-204a-4dd8-96ec-40c7df93975f.json} (91%) rename data/helm_classic/yandex/YaLM-100B/{b04c8845-cccf-4856-9597-ab283bb2ec8d.json => f84f84a8-7191-42ac-8951-5d7141a0f700.json} (91%) rename data/helm_classic/zhipu-ai/GLM-130B/{4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json => 9ba74767-b675-460a-bb68-e82adb6acd2f.json} (91%) rename data/helm_instruct/anthropic/claude-v1.3/{0e30e895-aaf7-42d4-95db-7541d6b41c87.json => e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json} (73%) rename data/helm_instruct/cohere/command-xlarge-beta/{4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json => 60724488-914d-4efe-98d6-f3ff26fe8fbc.json} (74%) rename data/helm_instruct/openai/gpt-3.5-turbo-0613/{8befd29c-a16d-4e05-a92f-00b621d45e03.json => 2aaae404-b510-41e0-9a4a-b2d053731454.json} (74%) rename data/helm_instruct/openai/gpt-4-0314/{b2e193b8-215b-4e80-9d5a-df11f1dac88a.json => 053badb4-b50a-434a-909c-c4d939c00b4e.json} (73%) rename data/helm_lite/01-ai/yi-34b/{eedd0f38-6d26-4297-a469-291227ec6be6.json => 7b4a4c6d-e302-4010-a099-5b01c874ffe8.json} (85%) rename data/helm_lite/01-ai/yi-6b/{74c47665-740f-4784-8a27-1c1d1c29bff8.json => db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json} (85%) rename data/helm_lite/01-ai/yi-large-preview/{8027b577-7f48-4df5-9879-bd45ac342f42.json => f6808908-79d9-4de5-8434-94e4bdb854f2.json} (85%) rename data/helm_lite/AlephAlpha/luminous-base/{e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json => 1a039ef6-5957-4246-82b2-bc607b6554e7.json} (85%) rename data/helm_lite/AlephAlpha/luminous-extended/{24e11e7b-15d6-4a09-9545-38486d0eb236.json => fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json} (86%) rename data/helm_lite/AlephAlpha/luminous-supreme/{eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json => 0e2790d3-40f1-4124-ba41-b65bd9de1852.json} (85%) rename data/helm_lite/ai21/j2-grande/{52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json => d55129d3-4eae-4009-a897-fa1624cea6a2.json} (85%) rename data/helm_lite/ai21/j2-jumbo/{68713712-ae92-474b-84c0-1b8301538439.json => 6332f0b3-7fab-41ed-a8da-46b142051377.json} (85%) rename data/helm_lite/ai21/jamba-1.5-large/{15cc9411-6ea4-4f10-831f-23ff27fd5704.json => 0cb33741-ca10-40f5-90d3-28e300901ad3.json} (85%) rename data/helm_lite/ai21/jamba-1.5-mini/{3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json => 80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json} (85%) rename data/helm_lite/ai21/jamba-instruct/{1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json => de41775f-f60e-481e-a8ef-3df9a9b65a5a.json} (85%) rename data/helm_lite/allenai/olmo-7b/{078d812b-2198-4497-8fbe-06fb640fd86d.json => bc29d5c6-b5c8-473b-b69c-054026829089.json} (85%) rename data/helm_lite/amazon/nova-lite-v1_0/{f928a53d-9d67-45e7-a871-04359c8162d5.json => ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json} (85%) rename data/helm_lite/amazon/nova-micro-v1_0/{741c4560-eb35-4edf-a48b-af29e743740a.json => 4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json} (85%) rename data/helm_lite/amazon/nova-pro-v1_0/{4e8a8384-5f1d-4b76-be9b-385407332d6c.json => 9ef56d5a-de00-4d89-930c-a4c74211dd78.json} (85%) rename data/helm_lite/anthropic/claude-2.0/{0684c1d2-ea43-4341-820c-09051f5e11f2.json => 5598d3ed-5b37-4aec-b186-0b16c394633b.json} (85%) rename data/helm_lite/anthropic/claude-2.1/{51821ca1-7eac-4094-abac-98b2484cc5a0.json => a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json} (85%) rename data/helm_lite/anthropic/claude-3-5-haiku-20241022/{8a0f5749-7f6a-4813-9c08-7283433c1337.json => 54bac699-aa82-4133-8c10-c6510c2a7f95.json} (86%) rename data/helm_lite/anthropic/claude-3-5-sonnet-20240620/{4697983d-a29a-484d-9268-7974117456e8.json => 79b23601-3148-4256-88ce-67e439a87c5b.json} (86%) rename data/helm_lite/anthropic/claude-3-5-sonnet-20241022/{60e33aa3-0593-42e6-9baa-8311746deca0.json => e92648e4-75c6-4944-9ec1-880823fefc87.json} (86%) rename data/helm_lite/anthropic/claude-3-haiku-20240307/{2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json => 449feffd-d2e3-4a08-ad69-b8ad522532ae.json} (86%) rename data/helm_lite/anthropic/claude-3-opus-20240229/{9ad91ee2-7a64-4f94-9166-f2681777023b.json => d297b253-0f4f-4caf-864b-9f457ab589da.json} (86%) rename data/helm_lite/anthropic/claude-3-sonnet-20240229/{4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json => d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json} (86%) rename data/helm_lite/anthropic/claude-instant-1.2/{64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json => cb409208-034d-42fd-acce-ab5cc4227383.json} (86%) rename data/helm_lite/anthropic/claude-v1.3/{fe8a36b0-4361-461b-b310-656c54131fa6.json => b2572ef8-446a-45b4-b557-45736418753b.json} (85%) rename data/helm_lite/cohere/command-light/{b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json => 70d85516-b710-4b27-b664-03a6a822773b.json} (85%) rename data/helm_lite/cohere/command-r-plus/{67967a2a-5fb4-46e8-b1ec-eda1588d9086.json => a8208df4-eb37-47d2-8845-f821e80e9858.json} (85%) rename data/helm_lite/cohere/command-r/{0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json => 22cde248-40ab-43b0-a408-6d8b84692f22.json} (85%) rename data/helm_lite/cohere/command/{ba5eea81-2120-4a20-8322-dfbd29cd197c.json => b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json} (85%) rename data/helm_lite/databricks/dbrx-instruct/{9dd66ede-da5c-4627-92ed-7057c9a2bea3.json => ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json} (85%) rename data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/{801aa7da-90b2-48d1-ad3d-943b06bd437c.json => 8721a15b-9102-4b1a-bde8-e5371f00f1b5.json} (85%) rename data/helm_lite/deepseek-ai/deepseek-v3/{a58923ea-fa22-4c45-8327-efbe84c8a05d.json => 23b3a30c-8aa3-4684-be54-adae003720fc.json} (85%) rename data/helm_lite/google/gemini-1.0-pro-002/{bab8d241-fad0-4230-b213-c2eeccc79f12.json => 7022c444-d6b8-4374-be0c-14835e5fd281.json} (85%) rename data/helm_lite/google/gemini-1.5-flash-001/{65e37589-ef26-46cd-a627-798af70e75bf.json => bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json} (85%) rename data/helm_lite/google/gemini-1.5-flash-002/{f499f9c6-4c9a-43ba-b4c3-d094494a371c.json => bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json} (85%) rename data/helm_lite/google/gemini-1.5-pro-001/{27a54446-57b2-4239-b768-7ab85dc94c54.json => 527418d0-2591-43c9-b639-17328292b110.json} (85%) rename data/helm_lite/google/gemini-1.5-pro-002/{5de8a13e-a029-4a90-9a2d-c28a59212140.json => 8ddc465f-4f2d-4213-81c4-70b584d48047.json} (85%) rename data/helm_lite/google/gemini-2.0-flash-exp/{f9643ce2-7347-401b-903e-fadcc5221f36.json => eca63d17-7fc2-4722-8bb3-0be99a257100.json} (85%) rename data/helm_lite/google/gemma-2-27b-it/{9932e430-2039-40b0-bc8f-ae2d833543e8.json => e40a10b3-e682-4715-b2ee-4efcae050a58.json} (85%) rename data/helm_lite/google/gemma-2-9b-it/{dbd2e9bb-c2ca-4165-b229-d736a70721a5.json => 56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json} (85%) rename data/helm_lite/google/gemma-7b/{32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json => f47ca10d-cd45-485e-b9cf-0c6592d63656.json} (85%) rename data/helm_lite/google/text-bison@001/{70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json => 7f0e318e-31bf-4044-bffb-357c1238d4fd.json} (85%) rename data/helm_lite/google/text-unicorn@001/{07a367ee-2879-4ede-bbf8-33b24d682467.json => 818d6d72-0b5c-4fcf-b808-1d186223301e.json} (85%) rename data/helm_lite/meta/llama-2-13b/{fee914c7-d6bf-4d61-9f50-71bae5f11006.json => f09b853b-dbbc-4252-a0f0-a2c45c29f670.json} (85%) rename data/helm_lite/meta/llama-2-70b/{b0577066-231e-461b-bae8-b724b204397a.json => f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json} (85%) rename data/helm_lite/meta/llama-2-7b/{b79fe2e3-5eec-46f8-90a1-810781c8c46a.json => 83c6a723-87a0-43d4-968e-86d186578e9e.json} (85%) rename data/helm_lite/meta/llama-3-70b/{998616ef-5d1b-4c65-b6ad-23afc3630d5a.json => daaf221b-1759-4619-91fb-938e81975787.json} (85%) rename data/helm_lite/meta/llama-3-8b/{fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json => 6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json} (85%) rename data/helm_lite/meta/llama-3.1-405b-instruct-turbo/{25fde5e6-86b8-4a80-8f79-5946ef9999fc.json => 1043b815-b247-4444-bf8c-0b92b793c57f.json} (86%) rename data/helm_lite/meta/llama-3.1-70b-instruct-turbo/{b955825d-ae7f-48c4-9dad-5ee78879737d.json => 28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json} (86%) rename data/helm_lite/meta/llama-3.1-8b-instruct-turbo/{168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json => 73dedd31-7d40-4ee6-994d-00eb7d656597.json} (86%) rename data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/{0807e353-9787-4ca0-8f7b-50d1bed2469e.json => 18da1dfa-5366-477b-a9cf-af29c5a99b68.json} (85%) rename data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/{0164b885-2c27-4eba-8e6f-e69156cb0dee.json => 80057cc1-45ab-4976-878e-be963eaa83b1.json} (85%) rename data/helm_lite/meta/llama-3.3-70b-instruct-turbo/{08422837-51a0-45c9-9004-fc5d98dce462.json => d896249f-bbd9-4657-a5db-5968544cb5fa.json} (86%) rename data/helm_lite/meta/llama-65b/{39f2c7f2-56d4-4349-95ae-374d34263f48.json => 9f73f3e5-b573-45d4-8c98-82f5c496f786.json} (85%) rename data/helm_lite/microsoft/phi-2/{0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json => a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json} (85%) rename data/helm_lite/microsoft/phi-3-medium-4k-instruct/{75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json => 4ff688da-61a0-43ce-9c2d-e1c197887683.json} (86%) rename data/helm_lite/microsoft/phi-3-small-8k-instruct/{2de4b89a-3f3b-4d1d-ba85-030953a46956.json => 181003ea-7587-4c93-8b89-c5c76958313d.json} (85%) rename data/helm_lite/mistralai/mistral-7b-instruct-v0.3/{bd68405f-fe9a-448b-9c80-468c656594e5.json => 66688228-e59a-4caa-b3fb-c5df1efc9db4.json} (86%) rename data/helm_lite/mistralai/mistral-7b-v0.1/{4267fef1-3180-46e3-990e-0d1092ec4c18.json => 2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json} (85%) rename data/helm_lite/mistralai/mistral-large-2402/{002a34dc-39e5-451d-b2a8-b51bdb69a056.json => 077fe37f-b3a4-483a-93a5-034c6445fe98.json} (86%) rename data/helm_lite/mistralai/mistral-large-2407/{5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json => 4fbb173c-b900-4e11-87bd-1ac6a489d014.json} (86%) rename data/helm_lite/mistralai/mistral-medium-2312/{ad2beded-cec3-4b47-b8de-a32a3225fa66.json => e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json} (86%) rename data/helm_lite/mistralai/mistral-small-2402/{eb901347-fc1f-4d8f-a70a-05a83e16658d.json => 0925f9b7-08f8-485f-84bc-a153a54aa417.json} (86%) rename data/helm_lite/mistralai/mixtral-8x22b/{9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json => 08082277-8305-4007-97cd-88202fc0115c.json} (85%) rename data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/{042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json => fe554cbd-2480-40bd-b2f5-464cad700c14.json} (86%) rename data/helm_lite/mistralai/open-mistral-nemo-2407/{d2d48e4a-0484-4f44-8108-2e689d7ca695.json => 9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json} (86%) rename data/helm_lite/openai/gpt-3.5-turbo-0613/{e54ae605-a91d-47d7-a08d-67bd0ea5c606.json => d9654997-1d3e-41c3-9f16-05a36dde9b02.json} (85%) rename data/helm_lite/openai/gpt-4-0613/{15dccf75-871d-457b-8495-e0d03d550360.json => 73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json} (85%) rename data/helm_lite/openai/gpt-4-1106-preview/{18fe5d30-bf36-405a-819e-1ecabda327ea.json => 4d01d929-b5e2-42dc-89ee-20560f560db5.json} (85%) rename data/helm_lite/openai/gpt-4-turbo-2024-04-09/{cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json => 76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json} (86%) rename data/helm_lite/openai/gpt-4o-2024-05-13/{cd199905-04a4-4745-b848-4f7bde97ca17.json => 69ea0ef0-c136-4cff-9607-6ae12e0692c3.json} (85%) rename data/helm_lite/openai/gpt-4o-2024-08-06/{1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json => bbe708f3-fb78-49e9-876d-cae57f1231cc.json} (85%) rename data/helm_lite/openai/gpt-4o-mini-2024-07-18/{bfd70aff-bf45-4f55-b730-4924afc181cd.json => ab7b7951-0792-4538-8a7a-6baee8602cbb.json} (86%) rename data/helm_lite/openai/text-davinci-002/{b6e08679-1bd7-42a1-9eee-98252de2c7c1.json => fc94c95d-9678-4f23-b82f-190a08ece307.json} (85%) rename data/helm_lite/openai/text-davinci-003/{22b411d5-a314-4b17-a9c7-c1af7ca7df33.json => 3f92e2fc-9831-4c2c-b94e-af33d457fa82.json} (85%) rename data/helm_lite/qwen/qwen1.5-110b-chat/{f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json => 3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json} (85%) rename data/helm_lite/qwen/qwen1.5-14b/{fb1bb023-16f6-4914-889b-6458d7ab1277.json => 6b2891bd-2444-4286-8ccf-c91181856d29.json} (85%) rename data/helm_lite/qwen/qwen1.5-32b/{8b572c10-3553-4e51-a321-bdb05996914b.json => bd924bd3-e13c-48e0-b339-8c15c5072038.json} (85%) rename data/helm_lite/qwen/qwen1.5-72b/{6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json => b8a6f32a-9904-43bb-9add-89404093a9db.json} (85%) rename data/helm_lite/qwen/qwen1.5-7b/{e0efe169-d28e-418e-a78c-9b04ec29aae2.json => c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json} (85%) rename data/helm_lite/qwen/qwen2-72b-instruct/{05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json => 9c1fc50a-437d-458b-926c-33cabdcc4aeb.json} (85%) rename data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/{983696ae-d7f3-48a4-b7a0-a42487728182.json => 5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json} (86%) rename data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/{a969e516-adef-4839-9252-244c58ab3c67.json => 10e1abfa-83de-4960-8d4c-c5099894cb80.json} (86%) rename data/helm_lite/snowflake/snowflake-arctic-instruct/{f122f9de-b1ce-40ea-8731-6c00c7af0498.json => 40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json} (86%) rename data/helm_lite/tiiuae/falcon-40b/{5c7982c5-3513-4ff2-9857-33a0db825376.json => 2abf3bb8-a78f-4a59-807e-52da4e6426fd.json} (85%) rename data/helm_lite/tiiuae/falcon-7b/{4910859a-750c-4728-bf30-309e0e81690e.json => ae28615a-b7fa-4782-89e1-4b8e4804dc62.json} (85%) rename data/helm_lite/upstage/solar-pro-241126/{32f0532f-b504-492d-84d7-f541930edad0.json => 52bb6ab9-e80b-4bf0-a375-7706f16d311d.json} (85%) rename data/helm_lite/writer/palmyra-x-004/{04c187a3-4532-4523-b39d-19314d61c779.json => fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json} (85%) rename data/helm_lite/writer/palmyra-x-v2/{4440532c-9b49-4c9a-8bf4-f122531c54fa.json => 1158720a-9a0e-492e-a677-9b0936f4cde5.json} (85%) rename data/helm_lite/writer/palmyra-x-v3/{bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json => 254ded81-4051-420d-b402-2e7b80a23848.json} (85%) rename data/helm_mmlu/01-ai/yi-34b/{3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json => ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json} (90%) rename data/helm_mmlu/01-ai/yi-6b/{6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json => 7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json} (90%) rename data/helm_mmlu/01-ai/yi-large-preview/{3d0b3d68-a853-4989-a35e-83ac6722c2da.json => 5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json} (90%) rename data/helm_mmlu/ai21/jamba-1.5-large/{ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json => 0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json} (90%) rename data/helm_mmlu/ai21/jamba-1.5-mini/{517e8027-6edd-482b-86f3-33b6c41a9609.json => 92e0b1b9-c167-4e07-b770-2b78527eb4eb.json} (90%) rename data/helm_mmlu/ai21/jamba-instruct/{f7c1c125-ad0f-4847-b880-4f705f1666c6.json => 3da06ad4-0770-45f5-a6a2-9ef9500cef05.json} (90%) rename data/helm_mmlu/allenai/olmo-1.7-7b/{5a0ba280-8a12-4735-9d92-4ed71ba395b4.json => c1c79360-60bd-4f5d-a746-e0411b94f69b.json} (90%) rename data/helm_mmlu/allenai/olmo-7b/{73ccc6a6-e10d-4619-914f-26032cddf8da.json => bb904716-048c-4b41-9f64-4d17c485afe3.json} (90%) rename data/helm_mmlu/amazon/nova-lite-v1_0/{20c5af59-ff73-4731-9230-f92bb86e657b.json => 063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json} (90%) rename data/helm_mmlu/amazon/nova-micro-v1_0/{fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json => c8949c55-8987-4ed3-b74b-8b13b4381806.json} (90%) rename data/helm_mmlu/amazon/nova-pro-v1_0/{d30617fc-8d64-4070-b86a-c982025cfcea.json => ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json} (90%) rename data/helm_mmlu/anthropic/claude-2.1/{aa8cae95-cb75-4241-951c-25e2046042dd.json => bc9cedd7-5cb2-44b2-abda-470322570e14.json} (90%) rename data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/{c88e4a03-22ae-4338-bf5f-36070814136a.json => 305a7f25-6e22-4146-9678-6a687a701567.json} (90%) rename data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/{4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json => c6059976-85a1-40ce-b02f-67e182aa2f7d.json} (90%) rename data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/{ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json => 6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json} (90%) rename data/helm_mmlu/anthropic/claude-3-haiku-20240307/{097a8da1-f411-4359-8440-2ab06f4ae76c.json => f397ca7a-41c4-4926-b075-2523639f0a50.json} (90%) rename data/helm_mmlu/anthropic/claude-3-opus-20240229/{68130abd-1df5-4cd3-919a-2863e9f013c7.json => acdf4701-e1c2-4867-bd85-d34ae8fb0991.json} (90%) rename data/helm_mmlu/anthropic/claude-3-sonnet-20240229/{5d8d795a-d213-4b96-9b17-ad5fae6b3687.json => 3cd855af-9679-4fd0-bc3f-34db697c7855.json} (90%) rename data/helm_mmlu/anthropic/claude-instant-1.2/{7908da03-f030-4c62-a121-c04bd94ea75e.json => 78fb6814-e32f-4b15-b958-9e001637ba07.json} (90%) rename data/helm_mmlu/cohere/command-r-plus/{c6fdbf96-2500-4410-8fcd-268ea3e16062.json => f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json} (90%) rename data/helm_mmlu/cohere/command-r/{537164c3-7b88-4543-b19d-370f55a25a66.json => cefc3b25-0779-4fb3-93a5-3c7a285304af.json} (90%) rename data/helm_mmlu/databricks/dbrx-instruct/{0c539e26-8403-42db-acfc-7953dd80ae20.json => 7e00e082-0e79-45e0-b0ff-5458cc2aff85.json} (90%) rename data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/{364c7490-8bb1-4e7e-b485-fb3c2224da58.json => ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json} (90%) rename data/helm_mmlu/deepseek-ai/deepseek-v3/{1a9167d2-882c-4582-b4e0-ac425896a317.json => c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json} (90%) rename data/helm_mmlu/google/gemini-1.0-pro-001/{8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json => 7ea5b404-d98f-4282-81d8-6ca5f6629429.json} (90%) rename data/helm_mmlu/google/gemini-1.5-flash-001/{d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json => 7056c7e7-f68a-4764-aa48-a8368ae2e317.json} (90%) rename data/helm_mmlu/google/gemini-1.5-flash-002/{a94c9e13-dca7-4e02-a795-09d9274354d3.json => 5e67014d-6ca1-4e65-a85a-84d91e147d4d.json} (90%) rename data/helm_mmlu/google/gemini-1.5-flash-preview-0514/{75c8b20f-a4d4-4699-be79-f027bf7f0d69.json => 3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json} (90%) rename data/helm_mmlu/google/gemini-1.5-pro-001/{264be7b4-08b7-40b6-a5e7-f3536f361450.json => 46d5e547-507e-4c98-98a9-bad1bfad7f7b.json} (90%) rename data/helm_mmlu/google/gemini-1.5-pro-002/{83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json => ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json} (90%) rename data/helm_mmlu/google/gemini-1.5-pro-preview-0409/{8a013eb3-0f21-4a50-8a53-4ba977951130.json => 2b31b441-caa9-465c-a2d2-051c951c7be3.json} (90%) rename data/helm_mmlu/google/gemini-2.0-flash-exp/{7b081a40-7cb6-4405-b842-3db95f290dfa.json => b7ea6c93-af70-4c0f-ba50-03a539416a8b.json} (90%) rename data/helm_mmlu/google/gemma-2-27b/{54185b53-9891-43c6-8f93-09ff02b728d8.json => fe4cec30-e483-49a8-80ea-00b2c6231740.json} (90%) rename data/helm_mmlu/google/gemma-2-9b/{884c194d-6519-4bd4-8add-6514e593c514.json => 53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json} (90%) rename data/helm_mmlu/google/gemma-7b/{a80cbd76-bcf8-4174-b0b3-346fae152bdb.json => af88b02d-cb29-4d2c-bb33-5fddcf316a95.json} (90%) rename data/helm_mmlu/google/text-bison@001/{5f105986-aa7d-4858-91bc-cece9d0085ba.json => a0abcd19-58a1-478a-9786-d044a4181241.json} (90%) rename data/helm_mmlu/google/text-unicorn@001/{528b7b4e-c8a6-4387-bd98-497a3316029d.json => 95eda13a-cd34-4170-b2db-f2ead47250f9.json} (90%) rename data/helm_mmlu/meta/llama-2-13b/{96eb34db-66bd-4945-8b4c-a8c1394fe56a.json => 7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json} (90%) rename data/helm_mmlu/meta/llama-2-70b/{961e917b-0e67-462c-b9d0-0fe4b4b85beb.json => 9da7439c-e96b-444f-b4fa-7ef638080740.json} (90%) rename data/helm_mmlu/meta/llama-2-7b/{59a85d2c-16ce-4ed4-bc65-f6898127fa57.json => 294b22a0-1676-4d8c-8ad2-5cdc40267255.json} (90%) rename data/helm_mmlu/meta/llama-3-70b/{16a8b446-51fc-4c23-9231-46ee16c1c0a8.json => 1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json} (90%) rename data/helm_mmlu/meta/llama-3-8b/{f4de7e58-7060-440b-8f6f-1f79d7499d1e.json => 78f2484e-bc73-4026-929b-db345e92cf5a.json} (90%) rename data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/{5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json => 8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json} (90%) rename data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/{dc6aa933-67e4-4811-b3e2-e5200c002abe.json => 41af381a-3637-4578-a582-59d9b1327d95.json} (90%) rename data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/{5f9758a3-fd6d-4598-930a-9c01420d05e8.json => 96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json} (90%) rename data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/{7592c0d8-a06c-4189-81a1-dbf794d22c8b.json => bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json} (90%) rename data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/{83c0e8e3-087c-4d61-9153-e571b4971871.json => e036de72-b425-4aa5-9448-dc52560e60db.json} (90%) rename data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/{c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json => 65423181-18f1-4296-98c2-171356106404.json} (90%) rename data/helm_mmlu/microsoft/phi-2/{5baac093-babb-41cd-a2f4-985d0b91be37.json => 41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json} (90%) rename data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/{1bf54088-ba12-45b4-8f80-63d5c38f58f6.json => f78d6e0a-a397-4a41-a37e-696bda5a1987.json} (90%) rename data/helm_mmlu/microsoft/phi-3-small-8k-instruct/{5ed0a970-200f-4f23-9623-e714afa49ddf.json => d2bf70ce-341f-49d7-bd03-87b523826953.json} (90%) rename data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/{e7fd06a6-65e5-4f88-8e86-c513f78e31db.json => b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json} (90%) rename data/helm_mmlu/mistralai/mistral-7b-v0.1/{ac047aef-008f-4c87-a6d5-4f331ebf5c53.json => 08590b6e-7050-413d-844b-1f3f1c5aa444.json} (90%) rename data/helm_mmlu/mistralai/mistral-large-2402/{ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json => 2d18fd88-73b5-4d4c-a1cc-e66a20316605.json} (90%) rename data/helm_mmlu/mistralai/mistral-large-2407/{7517b6c9-c613-416c-aadb-39fd6d252da7.json => 567918be-be6f-4e41-b613-727828fe8a44.json} (90%) rename data/helm_mmlu/mistralai/mistral-small-2402/{85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json => c2be131b-808c-4947-b24f-69ef6af499d7.json} (90%) rename data/helm_mmlu/mistralai/mixtral-8x22b/{df568c3c-8a5c-4455-836d-c980d7f5ea5c.json => 24955250-a2e9-475f-a866-30a835579e03.json} (90%) rename data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/{96e24977-ca6d-402c-bfd8-62be4cd9b902.json => de6f7e19-b54a-4bd3-b624-29f66afbee15.json} (90%) rename data/helm_mmlu/mistralai/open-mistral-nemo-2407/{e5b2636a-8438-40c0-9f89-9f35585bf740.json => e4c3032d-04e0-414b-a7e9-e30756d82000.json} (90%) rename data/helm_mmlu/openai/gpt-3.5-turbo-0125/{f3259d92-3c95-4b78-81ae-f7f4b80aec63.json => e9a41d4b-56c7-47f0-b439-72ad1e463000.json} (90%) rename data/helm_mmlu/openai/gpt-3.5-turbo-0613/{5ba23a34-4232-487f-b3e9-326d776135be.json => a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json} (90%) rename data/helm_mmlu/openai/gpt-4-0613/{5bc1a462-f753-4259-91c3-a549491b2986.json => fd6aea24-dc18-41ce-bc19-23f461a39032.json} (90%) rename data/helm_mmlu/openai/gpt-4-1106-preview/{16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json => 625d33ce-a320-4bfd-a962-451b8c22d392.json} (90%) rename data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/{dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json => e51be257-610e-4d38-b58a-a3b29fc06a83.json} (90%) rename data/helm_mmlu/openai/gpt-4o-2024-05-13/{2ca11d4c-52e6-49ea-a5cb-238c0313c483.json => 9e0b9f48-f913-4bbe-a135-59e596c9e479.json} (90%) rename data/helm_mmlu/openai/gpt-4o-2024-08-06/{de400624-6c2e-47af-b851-54c4075c30ee.json => 189e6cc5-1c8f-4712-8dda-c108f18f836d.json} (90%) rename data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/{34441b3b-4d66-444c-af85-ca0666a48ed4.json => 4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json} (90%) rename data/helm_mmlu/qwen/qwen1.5-110b-chat/{eecf5e40-9110-47ea-a72b-9ba587b96e30.json => ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json} (90%) rename data/helm_mmlu/qwen/qwen1.5-14b/{f26fb123-c214-4d18-aea8-b05b4ea1819b.json => fa6a6772-671b-402e-9480-d61e0fb4a61e.json} (90%) rename data/helm_mmlu/qwen/qwen1.5-32b/{30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json => b5279e94-ae7f-4671-9315-874e162a24fd.json} (90%) rename data/helm_mmlu/qwen/qwen1.5-72b/{b152cd5c-cbc0-48f4-ba37-16878c3afba1.json => de00e8da-9c83-40df-b642-b94719ce1ac2.json} (90%) rename data/helm_mmlu/qwen/qwen1.5-7b/{dac223e9-3073-46f9-924b-c5a6408f5da9.json => 119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json} (90%) rename data/helm_mmlu/qwen/qwen2-72b-instruct/{a7a218ff-7afe-417c-ac39-cf305d592d56.json => 80aabdf4-60b7-493b-98d8-1854f1c41c10.json} (90%) rename data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/{2e165735-43b8-4317-9cde-35aa4b5bcb26.json => 29958cee-32c9-4d51-8f14-72db4273459f.json} (90%) rename data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/{15c25bc5-7b1e-4771-bda2-fd04d74e1463.json => 72537b16-feda-4e5e-a477-f415650db847.json} (90%) rename data/helm_mmlu/snowflake/snowflake-arctic-instruct/{26036c7c-e981-46e8-b5e9-dcd7d116af70.json => 7df68af5-667a-4125-9c12-e71fb5af0a74.json} (90%) rename data/helm_mmlu/upstage/solar-pro-241126/{b3269e4e-98a7-4795-8ef3-fc87774a54b7.json => 1845eb8b-4c94-4d22-8771-012f7230dc62.json} (90%) rename data/helm_mmlu/writer/palmyra-x-004/{284fde9f-8570-4e6d-9190-e52d8723fe57.json => b2c8cfd1-f09a-4616-8038-c7e1930bce74.json} (90%) rename data/helm_mmlu/writer/palmyra-x-v3/{fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json => 12976629-cefe-4329-b974-bb17f88d385d.json} (90%) diff --git a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json similarity index 92% rename from data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json rename to data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json index 28c2132cc..8176fa91a 100644 --- a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/bd982107-7c03-4ee8-8a38-782d68883818.json +++ b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,7 +171,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -180,7 +180,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -230,7 +230,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -239,7 +239,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -290,7 +290,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -299,7 +299,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json similarity index 92% rename from data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json rename to data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json index c2c0ac804..4d2b264af 100644 --- a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/25aa6e41-ab16-4f63-9613-bfb83b9151c5.json +++ b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,7 +171,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -180,7 +180,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -230,7 +230,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -239,7 +239,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -290,7 +290,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -299,7 +299,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json similarity index 92% rename from data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json rename to data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json index cbc2ce18e..39fbc0d1c 100644 --- a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/ddd52881-1248-4652-9f1d-5d2b58ede889.json +++ b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,7 +171,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -180,7 +180,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -230,7 +230,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -239,7 +239,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -290,7 +290,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -299,7 +299,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json similarity index 92% rename from data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json rename to data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json index 4bae095b1..99d31c069 100644 --- a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/365bc693-73b6-41fe-a8fa-eba7b91febe0.json +++ b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,7 +171,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -180,7 +180,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -230,7 +230,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -239,7 +239,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -290,7 +290,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -299,7 +299,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json similarity index 92% rename from data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json rename to data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json index f34e2fca2..c786f36c7 100644 --- a/data/helm_capabilities/amazon/nova-lite-v1_0/a126b881-918a-411a-90e9-32d7b63d1e00.json +++ b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json similarity index 92% rename from data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json rename to data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json index da4fca4b9..6219cdf47 100644 --- a/data/helm_capabilities/amazon/nova-micro-v1_0/b8e54bb1-0768-4558-8dc2-4897d4e571aa.json +++ b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json similarity index 92% rename from data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json rename to data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json index 7d306af4a..d9f1bd857 100644 --- a/data/helm_capabilities/amazon/nova-premier-v1_0/a30e7ac5-0cc1-4f7a-acd0-c498dfeb2b4e.json +++ b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json similarity index 92% rename from data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json rename to data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json index 9634c0423..658945ff5 100644 --- a/data/helm_capabilities/amazon/nova-pro-v1_0/2413b504-7125-461b-ae9d-0c58211a5358.json +++ b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json rename to data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json index 59583f434..d63e271d1 100644 --- a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/f350d9d1-b743-4017-bc68-a4dc726515d0.json +++ b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json rename to data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json index 050628b1e..c53a3aa66 100644 --- a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c32a1f0a-bf8a-42be-b155-4f87465235bc.json +++ b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json rename to data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json index 325dd380e..1f5c52f66 100644 --- a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/96cfde1b-77de-4d2a-8b45-938116795108.json +++ b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json rename to data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json index 82dc8fad1..da15e55a7 100644 --- a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/56c180e5-45aa-4106-8f92-c6566c3c7dfc.json +++ b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json rename to data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json index 0e6c52fbd..c554c6a65 100644 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/d633fcd6-eb01-49ff-ba7c-6ca12734746f.json +++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json rename to data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json index 7abaf15ac..240e9ebf4 100644 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514/7a7b49ff-5060-4d12-acb9-607125fbe081.json +++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json index f65747fef..ecc6c0f0a 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/287a3646-d969-4bd9-9773-86463c1ba87f.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json index 98193fa4e..b4413ccdd 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/97f3892f-9588-49ef-abef-3a0c965bb352.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json similarity index 92% rename from data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json index 3583acbb0..e0991c0d9 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/22ba68b0-6eec-47f2-b465-47f298e8da09.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json similarity index 92% rename from data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json rename to data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json index 6cc5a7f14..682cc94cc 100644 --- a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/9e5684dc-6380-4353-b966-7205d66340fa.json +++ b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json similarity index 92% rename from data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json rename to data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json index 46c4843d4..3b034de70 100644 --- a/data/helm_capabilities/deepseek-ai/deepseek-v3/1cd4ddc5-dd48-456b-b065-a30f34cf4b8e.json +++ b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json similarity index 92% rename from data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json rename to data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json index 26e2e73d6..7d4281de4 100644 --- a/data/helm_capabilities/google/gemini-1.5-flash-002/20512a3b-ac0f-483a-8bec-9962980c579c.json +++ b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json similarity index 92% rename from data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json rename to data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json index 1157dc164..3c438fd59 100644 --- a/data/helm_capabilities/google/gemini-1.5-pro-002/704c5c74-a0ee-457d-9b4e-3ae895ffc105.json +++ b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json similarity index 92% rename from data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json rename to data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json index 68450c9bd..7f589b967 100644 --- a/data/helm_capabilities/google/gemini-2.0-flash-001/eb9224b8-0edb-4605-a2ee-cfb63f41370e.json +++ b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json similarity index 91% rename from data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json rename to data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json index 1bc6a5842..0376cdf40 100644 --- a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/4cb58f80-c2b1-45c6-b781-19af47660eb0.json +++ b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json similarity index 92% rename from data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json rename to data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json index f9f820a96..600681fbb 100644 --- a/data/helm_capabilities/google/gemini-2.5-flash-lite/6307e0c4-c983-4257-82d8-b2a50171eb8a.json +++ b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json similarity index 92% rename from data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json rename to data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json index 7f7987a29..221dc7a91 100644 --- a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/275cd615-bddf-4afe-a499-b463fe183486.json +++ b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json similarity index 92% rename from data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json rename to data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json index c845227fa..355cd3bc1 100644 --- a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/03b48360-a387-44ba-94b2-2eb7c234a9fa.json +++ b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json similarity index 92% rename from data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json rename to data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json index e4e82cd5b..d3ecb3ebb 100644 --- a/data/helm_capabilities/google/gemini-3-pro-preview/3a242fb8-07f9-460e-93eb-345aab0f994f.json +++ b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json similarity index 92% rename from data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json rename to data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json index 828363b5a..869902b9d 100644 --- a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5e5720d0-67fe-40a9-b65b-d4154848d83c.json +++ b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json similarity index 92% rename from data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json rename to data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json index 8203eb4c6..03bc0f0f8 100644 --- a/data/helm_capabilities/ibm/granite-4.0-h-small/9c9239df-0cbb-411f-af40-1b2782f91255.json +++ b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json similarity index 92% rename from data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json rename to data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json index bfe399026..399dbb1e3 100644 --- a/data/helm_capabilities/ibm/granite-4.0-micro/e1d12d96-185f-493e-bb08-8237623fb736.json +++ b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json similarity index 92% rename from data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json rename to data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json index 215be80f3..736686c13 100644 --- a/data/helm_capabilities/marin-community/marin-8b-instruct/aba1fded-b031-48df-87ef-dc744df33501.json +++ b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,7 +171,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -180,7 +180,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -230,7 +230,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -239,7 +239,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -290,7 +290,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -299,7 +299,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json similarity index 92% rename from data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json rename to data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json index 41fd4d1af..4dd5465a5 100644 --- a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/98f69aa6-b227-4076-a76e-1293cbe1c6cb.json +++ b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json similarity index 92% rename from data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json rename to data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json index 7e6e617b7..407242cbb 100644 --- a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/d2bb087e-a275-4fce-b6dc-001fd4545883.json +++ b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json similarity index 92% rename from data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json rename to data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json index 0c2bb79e7..30524d64b 100644 --- a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/84fe5ce1-51f2-433a-9047-03ccb3bc3ec5.json +++ b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json similarity index 92% rename from data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json rename to data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json index 71c8e88c3..d9ca75120 100644 --- a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/23c87ba9-8f95-47f3-b57e-7d22059a0ee4.json +++ b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json similarity index 92% rename from data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json rename to data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json index 35aef174b..640472423 100644 --- a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/9cab3a77-4f32-48d0-ba11-e2323ccc4861.json +++ b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json similarity index 92% rename from data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json rename to data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json index ee064ad73..0b19a4ab4 100644 --- a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9e037c92-1253-49be-b31a-3aa017531d77.json +++ b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json similarity index 92% rename from data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json rename to data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json index f4fd3ec06..dec52ca8a 100644 --- a/data/helm_capabilities/mistralai/mistral-large-2411/bd26c7cb-ce76-4b17-b617-d1d93a168c93.json +++ b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json similarity index 92% rename from data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json rename to data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json index ff90f0105..7999b823d 100644 --- a/data/helm_capabilities/mistralai/mistral-small-2503/9f4ca62a-b31b-44c9-bbc3-066905eccbc2.json +++ b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json similarity index 92% rename from data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json rename to data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json index 703963331..583f7956f 100644 --- a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/d69a1cbe-353c-4be9-b93b-5224d24c7adf.json +++ b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json similarity index 92% rename from data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json rename to data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json index c522fd879..d2c9cfb4e 100644 --- a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/915cb39d-f21f-4ef1-a95f-f44f79ede893.json +++ b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json similarity index 92% rename from data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json rename to data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json index b69be21a9..1946db617 100644 --- a/data/helm_capabilities/moonshotai/kimi-k2-instruct/fc29e8ce-7842-4f33-a70a-81fb194e0ab9.json +++ b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json similarity index 92% rename from data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json rename to data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json index 17443bc6f..3c36cb01b 100644 --- a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/eb51f418-6abf-4b2c-9f57-0b830c00bd15.json +++ b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json similarity index 92% rename from data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json rename to data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json index 0342d7835..dd4503511 100644 --- a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/41cd14b0-46ba-49da-844a-19fe866bef1e.json +++ b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json similarity index 92% rename from data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json rename to data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json index 15a7d0356..e2550958a 100644 --- a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/7de93642-a4bc-430b-8733-9befeb6a0e23.json +++ b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json similarity index 92% rename from data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json rename to data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json index ed5380bd3..3c3d40256 100644 --- a/data/helm_capabilities/openai/gpt-4o-2024-11-20/4f18292a-1fef-4feb-9b17-045c96e3e137.json +++ b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json similarity index 92% rename from data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json rename to data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json index e38c0ac88..778449e6e 100644 --- a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7458c032-b24d-4f13-a659-b6e19d19a8e1.json +++ b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json similarity index 92% rename from data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json rename to data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json index fb85b633b..95d9762ef 100644 --- a/data/helm_capabilities/openai/gpt-5-2025-08-07/21eb1648-aad0-4297-9edc-c445e4c38694.json +++ b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json similarity index 92% rename from data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json rename to data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json index 3ca436502..5dc165206 100644 --- a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/99d657ae-e850-4caf-a599-13f1b8072273.json +++ b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json similarity index 92% rename from data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json rename to data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json index e271e8724..096518c62 100644 --- a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/10cd766e-442c-4b3d-833b-740417d9a6d9.json +++ b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json similarity index 92% rename from data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json rename to data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json index 492db1047..738007852 100644 --- a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/bc6124a7-89df-4c3e-b824-56c948d1eeb5.json +++ b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json similarity index 92% rename from data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json rename to data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json index 13795ec21..8642e9954 100644 --- a/data/helm_capabilities/openai/gpt-oss-120b/06719cd4-5654-49b6-9dee-e112d1601d1c.json +++ b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json similarity index 91% rename from data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json rename to data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json index d2f755b28..5112d535f 100644 --- a/data/helm_capabilities/openai/gpt-oss-20b/ed849999-48c2-4569-8bcd-dc73084e3197.json +++ b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json similarity index 92% rename from data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json rename to data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json index 7455567bf..677721448 100644 --- a/data/helm_capabilities/openai/o3-2025-04-16/01bbecf5-1590-42d7-b27e-04b4f1d67bd6.json +++ b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json similarity index 92% rename from data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json rename to data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json index c33228ef1..fd4ae16c5 100644 --- a/data/helm_capabilities/openai/o4-mini-2025-04-16/32382d69-21c7-43a9-bb95-27607ec18cc9.json +++ b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json similarity index 92% rename from data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json rename to data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json index 31467bc1e..50778c699 100644 --- a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/77e702f7-37ef-4487-b047-74b13ef6d966.json +++ b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json similarity index 92% rename from data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json rename to data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json index 0ac7225b8..c974f1019 100644 --- a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/4ee3c647-740c-41a6-ac66-4a38b09317ff.json +++ b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json similarity index 92% rename from data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json rename to data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json index 1d36e4190..9ded60c84 100644 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/ca30726a-00a6-4228-94fe-5dce00de1d5e.json +++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json similarity index 92% rename from data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json rename to data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json index 04fc2f6cc..0210712c3 100644 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/7862890a-298b-4bda-b8f1-7be6a5779365.json +++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json similarity index 92% rename from data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json rename to data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json index bbcecd669..6ee69548e 100644 --- a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/8c73a09f-ba0d-4c12-a12a-776a17292151.json +++ b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json similarity index 91% rename from data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json rename to data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json index da11997be..b86fc5b45 100644 --- a/data/helm_capabilities/writer/palmyra-fin/442aed0d-95c3-4436-ad63-b7b1e93307f4.json +++ b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json similarity index 91% rename from data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json rename to data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json index 78088b82e..ac68f722a 100644 --- a/data/helm_capabilities/writer/palmyra-med/7d2ad864-a73f-487d-95e9-6ef5eb4e17c0.json +++ b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-med/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/writer_palmyra-med/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json similarity index 92% rename from data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json rename to data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json index b630b3cd0..9398b6319 100644 --- a/data/helm_capabilities/writer/palmyra-x-004/bc2c91e0-6afd-4e44-b665-d5c7558f8981.json +++ b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json similarity index 91% rename from data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json rename to data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json index c212295f1..6d3707107 100644 --- a/data/helm_capabilities/writer/palmyra-x5/a74b74f7-ccce-4341-a122-26728cc6bece.json +++ b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json similarity index 91% rename from data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json rename to data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json index 34f4be43b..54503d043 100644 --- a/data/helm_capabilities/xai/grok-3-beta/87811b75-afe8-413b-949d-7fd1f582a2e8.json +++ b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json similarity index 92% rename from data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json rename to data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json index 31dab978e..a083c0183 100644 --- a/data/helm_capabilities/xai/grok-3-mini-beta/ecf2fca8-3f8f-4012-ab89-48f1fc1bf678.json +++ b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json similarity index 91% rename from data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json rename to data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json index 872c4f1f5..a25562cb1 100644 --- a/data/helm_capabilities/xai/grok-4-0709/924080a0-c530-4e6d-b1a4-107de3bd7183.json +++ b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json similarity index 92% rename from data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json rename to data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json index 4ead0f554..43a98dd63 100644 --- a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/be23c720-a99a-4945-bc0b-ddc27c8eec39.json +++ b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770830201.581632", - "retrieved_timestamp": "1770830201.581632", + "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -47,7 +47,7 @@ } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", "source_data": { "dataset_name": "MMLU-Pro", "source_type": "url", @@ -56,7 +56,7 @@ ] }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -108,7 +108,7 @@ } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", "source_data": { "dataset_name": "GPQA", "source_type": "url", @@ -117,7 +117,7 @@ ] }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -169,7 +169,7 @@ } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", "source_data": { "dataset_name": "IFEval", "source_type": "url", @@ -178,7 +178,7 @@ ] }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -226,7 +226,7 @@ } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", "source_data": { "dataset_name": "WildBench", "source_type": "url", @@ -235,7 +235,7 @@ ] }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -285,7 +285,7 @@ } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", "source_data": { "dataset_name": "Omni-MATH", "source_type": "url", @@ -294,7 +294,7 @@ ] }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json similarity index 91% rename from data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json rename to data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json index 58e2410df..152223193 100644 --- a/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/425d4a41-2def-4581-9b61-ee33ecb3a822.json +++ b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json similarity index 92% rename from data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json rename to data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json index 4805e7ac8..6a9a41b41 100644 --- a/data/helm_classic/ai21/J1-Grande-v1-17B/c12a8494-bafc-4097-874a-7c00636e96f8.json +++ b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json similarity index 92% rename from data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json rename to data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json index e47585440..30c92ab94 100644 --- a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/4c7e96e1-4da9-488d-ba3d-3daa33d01a92.json +++ b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json similarity index 92% rename from data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json rename to data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json index bfd78fa42..df8111bcc 100644 --- a/data/helm_classic/ai21/J1-Jumbo-v1-178B/19f61327-fcc3-408f-9254-2d6a2aadcd4e.json +++ b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json similarity index 92% rename from data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json rename to data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json index a1c2d2860..5c8560533 100644 --- a/data/helm_classic/ai21/J1-Large-v1-7.5B/ccc17d56-bd26-409c-ac3f-262eaba9ce21.json +++ b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json similarity index 92% rename from data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json rename to data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json index e9db23ac9..4f288f894 100644 --- a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f6918b2c-0a1a-4ea9-b8e3-9a5f5b599356.json +++ b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json similarity index 92% rename from data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json rename to data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json index 38cd07e2a..6d0308b9f 100644 --- a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/9ab62321-4a79-4c0a-833a-ada0d2b9f6b7.json +++ b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json similarity index 91% rename from data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json rename to data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json index 589346e15..4278cef81 100644 --- a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/f25c142c-8730-4241-a649-01d076e1f28d.json +++ b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json similarity index 91% rename from data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json rename to data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json index 371a206a5..7e02805f7 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Base-13B/ab34f23e-36db-40c0-9681-f30b00692f98.json +++ b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json similarity index 91% rename from data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json rename to data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json index 715673aae..d6f8fa8ea 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/67281534-a03d-49d8-a586-25cb1a03134e.json +++ b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json similarity index 91% rename from data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json rename to data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json index 5f8731441..5680298fb 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/3ec7d0be-f30d-4cd8-bffe-d80b5deb2396.json +++ b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json similarity index 91% rename from data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json rename to data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json index 04305416d..caffd542e 100644 --- a/data/helm_classic/bigscience/BLOOM-176B/04ce2ba4-c382-4658-ba06-1def9499a243.json +++ b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json similarity index 93% rename from data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json rename to data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json index 1bbeba7ff..400f064d5 100644 --- a/data/helm_classic/bigscience/T0pp-11B/3a546396-d031-4958-8410-00e0d3406089.json +++ b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json similarity index 92% rename from data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json rename to data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json index fadfb62da..25f29c7e2 100644 --- a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/e7b99aa6-08e8-4224-a805-16586eb44325.json +++ b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json similarity index 92% rename from data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json rename to data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json index b1c061a45..8f01acff1 100644 --- a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/43a3fe19-929a-463d-a0ed-791dad765188.json +++ b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json similarity index 92% rename from data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json rename to data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json index bd838c107..16c06b937 100644 --- a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/75468958-b75b-41fe-9813-070b793e86d9.json +++ b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json similarity index 92% rename from data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json rename to data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json index 3e7a0f6fa..f0d42b850 100644 --- a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/6ae61644-7fe0-44c0-bd93-8ff4ab02bb19.json +++ b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json similarity index 92% rename from data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json rename to data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json index 745f99da6..43f986e70 100644 --- a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/3c9c425a-ce4a-4958-9744-7f9490ed5729.json +++ b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json similarity index 92% rename from data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json rename to data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json index 478f77b1d..adaaa9403 100644 --- a/data/helm_classic/cohere/Cohere-small-v20220720-410M/5a91486d-3fb2-4bff-89fd-dd70eb38e4d4.json +++ b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json similarity index 92% rename from data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json rename to data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json index 2039d0727..80b637746 100644 --- a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8b8394c3-46d5-45b3-9486-e5e0eef57cf3.json +++ b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json similarity index 92% rename from data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json rename to data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json index 216532187..cc49de0c7 100644 --- a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/f8044c74-3f1c-4562-a21c-e448061b2077.json +++ b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json similarity index 91% rename from data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json rename to data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json index 8f5d16956..bc304945b 100644 --- a/data/helm_classic/eleutherai/Pythia-12B/4abe3a0d-ba04-41f7-b107-59f11ff5697a.json +++ b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json similarity index 91% rename from data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json rename to data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json index 20ca16498..511816a71 100644 --- a/data/helm_classic/eleutherai/Pythia-6.9B/646adb7b-0761-4639-8776-83ea158bfca4.json +++ b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json similarity index 91% rename from data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json rename to data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json index d36f642d7..8d33e45b6 100644 --- a/data/helm_classic/google/Palmyra-X-43B/85cf6be2-d066-4e1b-b373-d53d3c922184.json +++ b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json similarity index 91% rename from data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json rename to data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json index 0f7601506..2a710defd 100644 --- a/data/helm_classic/google/T5-11B/52db5c6d-b54e-401a-880d-8ab41a394bc4.json +++ b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_T5-11B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/google_T5-11B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json similarity index 91% rename from data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json rename to data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json index 70193f3b0..bb571aece 100644 --- a/data/helm_classic/google/UL2-20B/68becad6-9455-4d3d-8d68-d1b4448598a1.json +++ b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_UL2-20B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/google_UL2-20B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json similarity index 91% rename from data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json rename to data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json index 385ac9b25..e1d9662a3 100644 --- a/data/helm_classic/lmsys/Vicuna-v1.3-13B/519c67f5-26bc-4e67-bcc7-5b0030cea7c0.json +++ b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json similarity index 91% rename from data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json rename to data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json index 3de9b1fd2..b03d7afe6 100644 --- a/data/helm_classic/lmsys/Vicuna-v1.3-7B/972bc5db-f536-42f9-aa51-83cc2f59b76a.json +++ b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json similarity index 91% rename from data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json rename to data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json index ac2da41ef..959b52195 100644 --- a/data/helm_classic/meta/LLaMA-13B/b2220101-56e0-49d9-a3d1-d3bec769ab97.json +++ b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-13B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_LLaMA-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json similarity index 91% rename from data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json rename to data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json index 1b33fd761..7f604e015 100644 --- a/data/helm_classic/meta/LLaMA-30B/96907b25-05c3-441b-afc4-69274c20bfc3.json +++ b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-30B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_LLaMA-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json similarity index 91% rename from data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json rename to data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json index a7d6351b2..ad8c1c451 100644 --- a/data/helm_classic/meta/LLaMA-65B/66e9c7c8-ec9a-4ed8-9d11-75625e6e3fd5.json +++ b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-65B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_LLaMA-65B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json similarity index 91% rename from data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json rename to data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json index 79b00a818..152b9e683 100644 --- a/data/helm_classic/meta/LLaMA-7B/70e9e156-6807-489b-b77a-367236614826.json +++ b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_LLaMA-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json similarity index 91% rename from data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json rename to data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json index 170095f5b..f2cd54e60 100644 --- a/data/helm_classic/meta/Llama-2-13B/e90cfb46-1173-4d22-9329-9bf57cdd5241.json +++ b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-13B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_Llama-2-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json similarity index 91% rename from data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json rename to data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json index 01e4b1b14..de031e670 100644 --- a/data/helm_classic/meta/Llama-2-70B/baedf4c4-4cfd-40a6-9148-4ff1c30e1dd7.json +++ b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-70B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_Llama-2-70B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json similarity index 91% rename from data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json rename to data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json index 40b71e7de..eac315fea 100644 --- a/data/helm_classic/meta/Llama-2-7B/7dd46eaa-bbc4-43e4-80d7-b3a644c90387.json +++ b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_Llama-2-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json similarity index 92% rename from data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json rename to data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json index f864b9222..63a0c348d 100644 --- a/data/helm_classic/meta/OPT-175B/ace4c4d3-42b3-4f38-89cc-01fd2c177af4.json +++ b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_OPT-175B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_OPT-175B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json similarity index 92% rename from data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json rename to data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json index ba62ce7d1..2f3d2ad96 100644 --- a/data/helm_classic/meta/OPT-66B/26ce27dd-6934-4e0f-bf1e-cd37ffc1a709.json +++ b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_OPT-66B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/meta_OPT-66B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json similarity index 91% rename from data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json rename to data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json index ce5dcad88..ddcfa82ef 100644 --- a/data/helm_classic/microsoft/TNLG-v2-530B/ecd21c26-cdc4-43b1-b933-4d970df9413a.json +++ b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json similarity index 91% rename from data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json rename to data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json index cfa4e8177..b3f527a04 100644 --- a/data/helm_classic/microsoft/TNLG-v2-6.7B/9d4350eb-cdf0-432f-b3b0-45f4832ca950.json +++ b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json similarity index 91% rename from data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json rename to data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json index 738857e58..1fd56a99f 100644 --- a/data/helm_classic/mistralai/Mistral-v0.1-7B/3bb95c3f-9e18-4ea7-ba7d-f8da48f5c39d.json +++ b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json similarity index 91% rename from data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json rename to data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json index 2580877d4..b0d1817b0 100644 --- a/data/helm_classic/mosaicml/MPT-30B/b277c87e-54b5-466f-97d7-35db4cd7b985.json +++ b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json similarity index 91% rename from data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json rename to data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json index a7cbf9856..771c4ac02 100644 --- a/data/helm_classic/mosaicml/MPT-Instruct-30B/270df23b-9e58-4259-a8ed-0d25b9c80b2a.json +++ b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json similarity index 92% rename from data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json rename to data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json index c135cdcfb..20a0f0d63 100644 --- a/data/helm_classic/openai/GPT-J-6B/1dc42e18-403a-4d6f-92f4-e6923c3b39b2.json +++ b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_GPT-J-6B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_GPT-J-6B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json similarity index 92% rename from data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json rename to data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json index d4e4c3e18..0c00ea05c 100644 --- a/data/helm_classic/openai/GPT-NeoX-20B/ef171b67-72a6-46d3-9eaf-4614ff474852.json +++ b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json similarity index 94% rename from data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json rename to data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json index ae351a8ab..5355ce78b 100644 --- a/data/helm_classic/openai/ada-350M/e6ea5f7e-0533-4a99-8638-1cc10c454238.json +++ b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_ada-350M/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_ada-350M/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json similarity index 94% rename from data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json rename to data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json index 4f76e1f1b..d3977fc36 100644 --- a/data/helm_classic/openai/babbage-1.3B/83c924fe-6318-4bad-adb0-8a81e5e28ee0.json +++ b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_babbage-1.3B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_babbage-1.3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json similarity index 94% rename from data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json rename to data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json index 84c344282..fe011ca06 100644 --- a/data/helm_classic/openai/curie-6.7B/82e2c0e3-66f2-431f-b4b8-d2495970d998.json +++ b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_curie-6.7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_curie-6.7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json similarity index 94% rename from data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json rename to data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json index fb0de7bd8..b376d2873 100644 --- a/data/helm_classic/openai/davinci-175B/6bb2a07c-1ee5-48a1-9ee9-c159712491c3.json +++ b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_davinci-175B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_davinci-175B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json similarity index 91% rename from data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json rename to data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json index 582bc2e6a..8051b9b3e 100644 --- a/data/helm_classic/openai/gpt-3.5-turbo-0301/e18fbf9e-677c-49fb-ab76-475e8f605f01.json +++ b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json similarity index 91% rename from data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json rename to data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json index 5a9810e18..b2682e6f7 100644 --- a/data/helm_classic/openai/gpt-3.5-turbo-0613/039af363-0c5c-4e36-8396-cd57c7e4c1de.json +++ b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json similarity index 94% rename from data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json rename to data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json index 0e0d9602b..43f728bf2 100644 --- a/data/helm_classic/openai/text-ada-001/8ea1facb-260a-461d-9271-2c07b318c46f.json +++ b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-ada-001/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_text-ada-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json similarity index 94% rename from data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json rename to data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json index 734c00775..fbb4b5bb6 100644 --- a/data/helm_classic/openai/text-babbage-001/93007ac9-04c2-451d-abd2-2f235297747e.json +++ b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-babbage-001/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_text-babbage-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json similarity index 94% rename from data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json rename to data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json index ba874427c..4537bcc84 100644 --- a/data/helm_classic/openai/text-curie-001/b04e5f90-e46e-4d7a-a6a9-569bde072208.json +++ b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-curie-001/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_text-curie-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json similarity index 94% rename from data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json rename to data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json index 4555e0f80..0e9fa4947 100644 --- a/data/helm_classic/openai/text-davinci-002/933dc76f-45f0-48e0-93ae-3e19cff87c2a.json +++ b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-davinci-002/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_text-davinci-002/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json similarity index 94% rename from data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json rename to data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json index 7fd229e00..9ca831c0f 100644 --- a/data/helm_classic/openai/text-davinci-003/b8408a64-eb89-4337-8ee5-3c48e4e24437.json +++ b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-davinci-003/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/openai_text-davinci-003/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json similarity index 91% rename from data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json rename to data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json index f68731052..cf2a4b297 100644 --- a/data/helm_classic/stanford/Alpaca-7B/d5846321-0800-4ff9-b85c-53c8b4884ba5.json +++ b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json similarity index 91% rename from data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json rename to data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json index 85693f897..97f13c6d9 100644 --- a/data/helm_classic/tiiuae/Falcon-40B/baa5f92c-b626-4e09-a084-61ce7f5dee98.json +++ b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json similarity index 91% rename from data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json rename to data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json index e165123de..80c0ac18a 100644 --- a/data/helm_classic/tiiuae/Falcon-7B/9b648e90-8d3c-403d-9ad8-382ef0b212a6.json +++ b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json similarity index 91% rename from data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json rename to data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json index 3c1369c88..4b7c6b681 100644 --- a/data/helm_classic/tiiuae/Falcon-Instruct-40B/0692f762-337e-4c20-8ad6-feecc93882a3.json +++ b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json similarity index 91% rename from data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json rename to data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json index 19076bf3f..cd7efa818 100644 --- a/data/helm_classic/tiiuae/Falcon-Instruct-7B/a91c9563-0756-4616-8a58-3c8000f73895.json +++ b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json similarity index 91% rename from data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json rename to data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json index 90ced7618..f25c83f2e 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Base-7B/3a329574-dcf6-4177-b37c-c495e6af6cc5.json +++ b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json similarity index 91% rename from data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json rename to data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json index 858c06ee0..d4d85552c 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/9e662c1e-e77c-4fb3-b589-127683a4b2ca.json +++ b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json similarity index 91% rename from data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json rename to data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json index e246416bd..9d60f7506 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/375140f6-bd3f-4b55-a35c-23de37254296.json +++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json similarity index 91% rename from data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json rename to data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json index 828ab9683..57ffafd39 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/021d0b25-8f58-47da-a58c-ac532a7972bf.json +++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json similarity index 91% rename from data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json rename to data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json index 0d4ab9c94..fe1ab40e2 100644 --- a/data/helm_classic/writer/InstructPalmyra-30B/9207fec4-d0c4-4f66-b917-f5ed57409215.json +++ b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json similarity index 91% rename from data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json rename to data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json index 4b439ab57..61a019ad2 100644 --- a/data/helm_classic/yandex/YaLM-100B/b04c8845-cccf-4856-9597-ab283bb2ec8d.json +++ b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/yandex_YaLM-100B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/yandex_YaLM-100B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json similarity index 91% rename from data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json rename to data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json index 67e0f75ce..04bdfa490 100644 --- a/data/helm_classic/zhipu-ai/GLM-130B/4bddbe5c-504a-4989-b4cb-8f4af5fccaf6.json +++ b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770830385.7573261", - "retrieved_timestamp": "1770830385.7573261", + "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -82,7 +82,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -91,7 +91,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -159,7 +159,7 @@ } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", "source_data": { "dataset_name": "BoolQ", "source_type": "url", @@ -168,7 +168,7 @@ ] }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -363,7 +363,7 @@ } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", "source_data": { "dataset_name": "NaturalQuestions (open-book)", "source_type": "url", @@ -372,7 +372,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -540,7 +540,7 @@ } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", "source_data": { "dataset_name": "QuAC", "source_type": "url", @@ -549,7 +549,7 @@ ] }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -642,7 +642,7 @@ } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", "source_data": { "dataset_name": "HellaSwag", "source_type": "url", @@ -651,7 +651,7 @@ ] }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -719,7 +719,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -728,7 +728,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -796,7 +796,7 @@ } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", "source_data": { "dataset_name": "TruthfulQA", "source_type": "url", @@ -805,7 +805,7 @@ ] }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -873,7 +873,7 @@ } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", "source_data": { "dataset_name": "MS MARCO (TREC)", "source_type": "url", @@ -882,7 +882,7 @@ ] }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1040,7 +1040,7 @@ } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", "source_data": { "dataset_name": "CNN/DailyMail", "source_type": "url", @@ -1049,7 +1049,7 @@ ] }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1172,7 +1172,7 @@ } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", "source_data": { "dataset_name": "XSUM", "source_type": "url", @@ -1181,7 +1181,7 @@ ] }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1304,7 +1304,7 @@ } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", "source_data": { "dataset_name": "IMDB", "source_type": "url", @@ -1313,7 +1313,7 @@ ] }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1406,7 +1406,7 @@ } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", "source_data": { "dataset_name": "CivilComments", "source_type": "url", @@ -1415,7 +1415,7 @@ ] }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1508,7 +1508,7 @@ } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", "source_data": { "dataset_name": "RAFT", "source_type": "url", @@ -1517,7 +1517,7 @@ ] }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json similarity index 73% rename from data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json rename to data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json index abd3e5e5b..31ab229b7 100644 --- a/data/helm_instruct/anthropic/claude-v1.3/0e30e895-aaf7-42d4-95db-7541d6b41c87.json +++ b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770830411.78817", - "retrieved_timestamp": "1770830411.78817", + "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", "source_metadata": { "source_name": "helm_instruct", "source_type": "documentation", @@ -42,7 +42,7 @@ } }, { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", + "evaluation_name": "Anthropic RLHF dataset", "source_data": { "dataset_name": "Anthropic RLHF dataset", "source_type": "url", @@ -51,7 +51,7 @@ ] }, "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -90,7 +90,7 @@ } }, { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", + "evaluation_name": "Best ChatGPT Prompts", "source_data": { "dataset_name": "Best ChatGPT Prompts", "source_type": "url", @@ -99,7 +99,7 @@ ] }, "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -126,7 +126,7 @@ } }, { - "evaluation_name": "Koala test dataset - Harmlessness", + "evaluation_name": "Koala test dataset", "source_data": { "dataset_name": "Koala test dataset", "source_type": "url", @@ -135,7 +135,7 @@ ] }, "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Koala test dataset", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -160,7 +160,7 @@ } }, { - "evaluation_name": "Open Assistant - Harmlessness", + "evaluation_name": "Open Assistant", "source_data": { "dataset_name": "Open Assistant", "source_type": "url", @@ -169,7 +169,7 @@ ] }, "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Open Assistant", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -195,7 +195,7 @@ } }, { - "evaluation_name": "Self Instruct - Harmlessness", + "evaluation_name": "Self Instruct", "source_data": { "dataset_name": "Self Instruct", "source_type": "url", @@ -204,7 +204,7 @@ ] }, "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Self Instruct", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -229,7 +229,7 @@ } }, { - "evaluation_name": "Vicuna - Harmlessness", + "evaluation_name": "Vicuna", "source_data": { "dataset_name": "Vicuna", "source_type": "url", @@ -238,7 +238,7 @@ ] }, "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Vicuna", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json similarity index 74% rename from data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json rename to data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json index 3aea06a21..2fd221159 100644 --- a/data/helm_instruct/cohere/command-xlarge-beta/4ef01f10-12bb-41f9-bd3b-36843a31d8ca.json +++ b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770830411.78817", - "retrieved_timestamp": "1770830411.78817", + "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", "source_metadata": { "source_name": "helm_instruct", "source_type": "documentation", @@ -42,7 +42,7 @@ } }, { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", + "evaluation_name": "Anthropic RLHF dataset", "source_data": { "dataset_name": "Anthropic RLHF dataset", "source_type": "url", @@ -51,7 +51,7 @@ ] }, "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -90,7 +90,7 @@ } }, { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", + "evaluation_name": "Best ChatGPT Prompts", "source_data": { "dataset_name": "Best ChatGPT Prompts", "source_type": "url", @@ -99,7 +99,7 @@ ] }, "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -126,7 +126,7 @@ } }, { - "evaluation_name": "Koala test dataset - Harmlessness", + "evaluation_name": "Koala test dataset", "source_data": { "dataset_name": "Koala test dataset", "source_type": "url", @@ -135,7 +135,7 @@ ] }, "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Koala test dataset", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -160,7 +160,7 @@ } }, { - "evaluation_name": "Open Assistant - Harmlessness", + "evaluation_name": "Open Assistant", "source_data": { "dataset_name": "Open Assistant", "source_type": "url", @@ -169,7 +169,7 @@ ] }, "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Open Assistant", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -195,7 +195,7 @@ } }, { - "evaluation_name": "Self Instruct - Harmlessness", + "evaluation_name": "Self Instruct", "source_data": { "dataset_name": "Self Instruct", "source_type": "url", @@ -204,7 +204,7 @@ ] }, "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Self Instruct", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -229,7 +229,7 @@ } }, { - "evaluation_name": "Vicuna - Harmlessness", + "evaluation_name": "Vicuna", "source_data": { "dataset_name": "Vicuna", "source_type": "url", @@ -238,7 +238,7 @@ ] }, "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Vicuna", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json similarity index 74% rename from data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json rename to data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json index 31fd0891a..23dfc4397 100644 --- a/data/helm_instruct/openai/gpt-3.5-turbo-0613/8befd29c-a16d-4e05-a92f-00b621d45e03.json +++ b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770830411.78817", - "retrieved_timestamp": "1770830411.78817", + "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", "source_metadata": { "source_name": "helm_instruct", "source_type": "documentation", @@ -42,7 +42,7 @@ } }, { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", + "evaluation_name": "Anthropic RLHF dataset", "source_data": { "dataset_name": "Anthropic RLHF dataset", "source_type": "url", @@ -51,7 +51,7 @@ ] }, "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -90,7 +90,7 @@ } }, { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", + "evaluation_name": "Best ChatGPT Prompts", "source_data": { "dataset_name": "Best ChatGPT Prompts", "source_type": "url", @@ -99,7 +99,7 @@ ] }, "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -126,7 +126,7 @@ } }, { - "evaluation_name": "Koala test dataset - Harmlessness", + "evaluation_name": "Koala test dataset", "source_data": { "dataset_name": "Koala test dataset", "source_type": "url", @@ -135,7 +135,7 @@ ] }, "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Koala test dataset", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -160,7 +160,7 @@ } }, { - "evaluation_name": "Open Assistant - Harmlessness", + "evaluation_name": "Open Assistant", "source_data": { "dataset_name": "Open Assistant", "source_type": "url", @@ -169,7 +169,7 @@ ] }, "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Open Assistant", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -195,7 +195,7 @@ } }, { - "evaluation_name": "Self Instruct - Harmlessness", + "evaluation_name": "Self Instruct", "source_data": { "dataset_name": "Self Instruct", "source_type": "url", @@ -204,7 +204,7 @@ ] }, "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Self Instruct", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -229,7 +229,7 @@ } }, { - "evaluation_name": "Vicuna - Harmlessness", + "evaluation_name": "Vicuna", "source_data": { "dataset_name": "Vicuna", "source_type": "url", @@ -238,7 +238,7 @@ ] }, "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Vicuna", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json similarity index 73% rename from data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json rename to data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json index ac8e25cb0..9ad1bca2e 100644 --- a/data/helm_instruct/openai/gpt-4-0314/b2e193b8-215b-4e80-9d5a-df11f1dac88a.json +++ b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770830411.78817", - "retrieved_timestamp": "1770830411.78817", + "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", "source_metadata": { "source_name": "helm_instruct", "source_type": "documentation", @@ -42,7 +42,7 @@ } }, { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", + "evaluation_name": "Anthropic RLHF dataset", "source_data": { "dataset_name": "Anthropic RLHF dataset", "source_type": "url", @@ -51,7 +51,7 @@ ] }, "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -90,7 +90,7 @@ } }, { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", + "evaluation_name": "Best ChatGPT Prompts", "source_data": { "dataset_name": "Best ChatGPT Prompts", "source_type": "url", @@ -99,7 +99,7 @@ ] }, "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -126,7 +126,7 @@ } }, { - "evaluation_name": "Koala test dataset - Harmlessness", + "evaluation_name": "Koala test dataset", "source_data": { "dataset_name": "Koala test dataset", "source_type": "url", @@ -135,7 +135,7 @@ ] }, "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Koala test dataset", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -160,7 +160,7 @@ } }, { - "evaluation_name": "Open Assistant - Harmlessness", + "evaluation_name": "Open Assistant", "source_data": { "dataset_name": "Open Assistant", "source_type": "url", @@ -169,7 +169,7 @@ ] }, "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Open Assistant", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -195,7 +195,7 @@ } }, { - "evaluation_name": "Self Instruct - Harmlessness", + "evaluation_name": "Self Instruct", "source_data": { "dataset_name": "Self Instruct", "source_type": "url", @@ -204,7 +204,7 @@ ] }, "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Self Instruct", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -229,7 +229,7 @@ } }, { - "evaluation_name": "Vicuna - Harmlessness", + "evaluation_name": "Vicuna", "source_data": { "dataset_name": "Vicuna", "source_type": "url", @@ -238,7 +238,7 @@ ] }, "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", + "evaluation_description": "Harmlessness on Vicuna", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json similarity index 85% rename from data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json rename to data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json index 96c3d4d2d..946b7db3e 100644 --- a/data/helm_lite/01-ai/yi-34b/eedd0f38-6d26-4297-a469-291227ec6be6.json +++ b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-34b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/01-ai_yi-34b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json similarity index 85% rename from data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json rename to data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json index 497d98a06..28ba5fb69 100644 --- a/data/helm_lite/01-ai/yi-6b/74c47665-740f-4784-8a27-1c1d1c29bff8.json +++ b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-6b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/01-ai_yi-6b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json similarity index 85% rename from data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json rename to data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json index 7bea38ffb..9fe678bb4 100644 --- a/data/helm_lite/01-ai/yi-large-preview/8027b577-7f48-4df5-9879-bd45ac342f42.json +++ b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json similarity index 85% rename from data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json rename to data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json index 00a6f037c..fb405652b 100644 --- a/data/helm_lite/AlephAlpha/luminous-base/e6edad4c-f331-4c80-a7bf-a297d8ae3c89.json +++ b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json similarity index 86% rename from data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json rename to data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json index 215983cef..786a7e340 100644 --- a/data/helm_lite/AlephAlpha/luminous-extended/24e11e7b-15d6-4a09-9545-38486d0eb236.json +++ b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json similarity index 85% rename from data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json rename to data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json index 81f487c09..78da47969 100644 --- a/data/helm_lite/AlephAlpha/luminous-supreme/eb01d9c4-4d89-4451-b6d8-c1ea9d84a1d2.json +++ b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json similarity index 85% rename from data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json rename to data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json index ef3567598..2b870e958 100644 --- a/data/helm_lite/ai21/j2-grande/52ece7a1-9ef3-461e-9900-8c6cf0ad1844.json +++ b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_j2-grande/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/ai21_j2-grande/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json similarity index 85% rename from data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json rename to data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json index f39f9c93e..643b24001 100644 --- a/data/helm_lite/ai21/j2-jumbo/68713712-ae92-474b-84c0-1b8301538439.json +++ b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_j2-jumbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/ai21_j2-jumbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json similarity index 85% rename from data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json rename to data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json index d7dee0e9a..a07da123a 100644 --- a/data/helm_lite/ai21/jamba-1.5-large/15cc9411-6ea4-4f10-831f-23ff27fd5704.json +++ b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json similarity index 85% rename from data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json rename to data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json index f65e65120..9e0628c9d 100644 --- a/data/helm_lite/ai21/jamba-1.5-mini/3b308a74-7839-4f3c-b7bc-56a7d0a76bc5.json +++ b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json similarity index 85% rename from data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json rename to data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json index a3e5bda34..9e1241a8e 100644 --- a/data/helm_lite/ai21/jamba-instruct/1b2e4f70-f4f0-4241-951f-fb46435a5d3d.json +++ b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-instruct/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/ai21_jamba-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json similarity index 85% rename from data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json rename to data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json index 51375c00c..b68794dd1 100644 --- a/data/helm_lite/allenai/olmo-7b/078d812b-2198-4497-8fbe-06fb640fd86d.json +++ b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/allenai_olmo-7b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/allenai_olmo-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json similarity index 85% rename from data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json rename to data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json index 289dc9306..084734ba7 100644 --- a/data/helm_lite/amazon/nova-lite-v1_0/f928a53d-9d67-45e7-a871-04359c8162d5.json +++ b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -519,7 +519,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -528,7 +528,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -576,7 +576,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -585,7 +585,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json similarity index 85% rename from data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json rename to data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json index bcd94c63d..fb66c7744 100644 --- a/data/helm_lite/amazon/nova-micro-v1_0/741c4560-eb35-4edf-a48b-af29e743740a.json +++ b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -519,7 +519,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -528,7 +528,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -576,7 +576,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -585,7 +585,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json similarity index 85% rename from data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json rename to data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json index c8589b186..c7f9d86e2 100644 --- a/data/helm_lite/amazon/nova-pro-v1_0/4e8a8384-5f1d-4b76-be9b-385407332d6c.json +++ b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -519,7 +519,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -528,7 +528,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -576,7 +576,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -585,7 +585,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json similarity index 85% rename from data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json rename to data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json index 2f9d0f3e2..ab0989b58 100644 --- a/data/helm_lite/anthropic/claude-2.0/0684c1d2-ea43-4341-820c-09051f5e11f2.json +++ b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-2.0/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-2.0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json similarity index 85% rename from data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json rename to data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json index fe9f851b2..2adbb62af 100644 --- a/data/helm_lite/anthropic/claude-2.1/51821ca1-7eac-4094-abac-98b2484cc5a0.json +++ b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-2.1/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-2.1/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json similarity index 86% rename from data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json rename to data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json index 9eecf8a25..ff757a7ad 100644 --- a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/8a0f5749-7f6a-4813-9c08-7283433c1337.json +++ b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -519,7 +519,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -528,7 +528,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -576,7 +576,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -585,7 +585,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json similarity index 86% rename from data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json rename to data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json index f3aab2968..2c4b0d7d1 100644 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/4697983d-a29a-484d-9268-7974117456e8.json +++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json similarity index 86% rename from data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json rename to data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json index 6a814b17d..4b9824f13 100644 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/60e33aa3-0593-42e6-9baa-8311746deca0.json +++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json similarity index 86% rename from data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json rename to data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json index 54328bd79..8eac62865 100644 --- a/data/helm_lite/anthropic/claude-3-haiku-20240307/2fe5cf18-696e-4095-9ac4-ec91c3df9d3d.json +++ b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json similarity index 86% rename from data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json rename to data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json index ad60ccaa3..d590c786e 100644 --- a/data/helm_lite/anthropic/claude-3-opus-20240229/9ad91ee2-7a64-4f94-9166-f2681777023b.json +++ b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json similarity index 86% rename from data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json rename to data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json index 35374c2f9..90baddbf7 100644 --- a/data/helm_lite/anthropic/claude-3-sonnet-20240229/4d110c98-b39e-4e4c-8b4f-4cb2e3335565.json +++ b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json similarity index 86% rename from data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json rename to data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json index 7dbf7e9ee..c3ca60cb8 100644 --- a/data/helm_lite/anthropic/claude-instant-1.2/64a5133c-b31c-4dda-92c4-e8723d0b7ae0.json +++ b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json similarity index 85% rename from data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json rename to data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json index 04da077b3..da3e6b3b3 100644 --- a/data/helm_lite/anthropic/claude-v1.3/fe8a36b0-4361-461b-b310-656c54131fa6.json +++ b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json similarity index 85% rename from data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json rename to data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json index b4ccf63fd..a431f3338 100644 --- a/data/helm_lite/cohere/command-light/b5e4e9fb-dbc7-45ed-b50b-31fd6895eacf.json +++ b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-light/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/cohere_command-light/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json similarity index 85% rename from data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json rename to data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json index e941df44c..d0f464767 100644 --- a/data/helm_lite/cohere/command-r-plus/67967a2a-5fb4-46e8-b1ec-eda1588d9086.json +++ b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-r-plus/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/cohere_command-r-plus/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json similarity index 85% rename from data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json rename to data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json index 2314d1d0d..51821d155 100644 --- a/data/helm_lite/cohere/command-r/0331f7f8-f72e-4614-a2d5-b7c5005ddf82.json +++ b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-r/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/cohere_command-r/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json similarity index 85% rename from data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json rename to data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json index 95909d3aa..488fa54b9 100644 --- a/data/helm_lite/cohere/command/ba5eea81-2120-4a20-8322-dfbd29cd197c.json +++ b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/cohere_command/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json similarity index 85% rename from data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json rename to data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json index 81dc83db8..9dc0aa32d 100644 --- a/data/helm_lite/databricks/dbrx-instruct/9dd66ede-da5c-4627-92ed-7057c9a2bea3.json +++ b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json similarity index 85% rename from data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json rename to data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json index 31cee265a..201ddf6e5 100644 --- a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/801aa7da-90b2-48d1-ad3d-943b06bd437c.json +++ b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json similarity index 85% rename from data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json rename to data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json index cc64f30ee..b5f8e240f 100644 --- a/data/helm_lite/deepseek-ai/deepseek-v3/a58923ea-fa22-4c45-8327-efbe84c8a05d.json +++ b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json similarity index 85% rename from data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json rename to data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json index f6af740ee..eabdc0bbd 100644 --- a/data/helm_lite/google/gemini-1.0-pro-002/bab8d241-fad0-4230-b213-c2eeccc79f12.json +++ b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json similarity index 85% rename from data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json rename to data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json index 7c312bb83..991b81669 100644 --- a/data/helm_lite/google/gemini-1.5-flash-001/65e37589-ef26-46cd-a627-798af70e75bf.json +++ b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json similarity index 85% rename from data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json rename to data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json index 450dbafcb..725c639a2 100644 --- a/data/helm_lite/google/gemini-1.5-flash-002/f499f9c6-4c9a-43ba-b4c3-d094494a371c.json +++ b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json similarity index 85% rename from data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json rename to data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json index 653e006ee..8b7eab026 100644 --- a/data/helm_lite/google/gemini-1.5-pro-001/27a54446-57b2-4239-b768-7ab85dc94c54.json +++ b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json similarity index 85% rename from data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json rename to data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json index 64f712478..ebd3081fb 100644 --- a/data/helm_lite/google/gemini-1.5-pro-002/5de8a13e-a029-4a90-9a2d-c28a59212140.json +++ b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json similarity index 85% rename from data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json rename to data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json index 371c57f27..b96b71c0c 100644 --- a/data/helm_lite/google/gemini-2.0-flash-exp/f9643ce2-7347-401b-903e-fadcc5221f36.json +++ b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -519,7 +519,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -528,7 +528,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -576,7 +576,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -585,7 +585,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json similarity index 85% rename from data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json rename to data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json index 24f598da3..ea107cc9e 100644 --- a/data/helm_lite/google/gemma-2-27b-it/9932e430-2039-40b0-bc8f-ae2d833543e8.json +++ b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json similarity index 85% rename from data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json rename to data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json index 1e65ff610..1488d6604 100644 --- a/data/helm_lite/google/gemma-2-9b-it/dbd2e9bb-c2ca-4165-b229-d736a70721a5.json +++ b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json similarity index 85% rename from data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json rename to data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json index ee614ce44..810e32965 100644 --- a/data/helm_lite/google/gemma-7b/32c25b28-3b31-4bdf-8e6e-6039c6ddcade.json +++ b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-7b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_gemma-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json similarity index 85% rename from data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json rename to data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json index 7d7c944f0..30d0e3442 100644 --- a/data/helm_lite/google/text-bison@001/70ebaf81-e3eb-48e8-b2ef-716cde9a8d74.json +++ b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_text-bison@001/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_text-bison@001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json similarity index 85% rename from data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json rename to data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json index f19d99b14..d5841340f 100644 --- a/data/helm_lite/google/text-unicorn@001/07a367ee-2879-4ede-bbf8-33b24d682467.json +++ b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_text-unicorn@001/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/google_text-unicorn@001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json similarity index 85% rename from data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json rename to data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json index f38e87995..079c14180 100644 --- a/data/helm_lite/meta/llama-2-13b/fee914c7-d6bf-4d61-9f50-71bae5f11006.json +++ b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-13b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-2-13b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json similarity index 85% rename from data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json rename to data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json index b0d616c29..8faa07285 100644 --- a/data/helm_lite/meta/llama-2-70b/b0577066-231e-461b-bae8-b724b204397a.json +++ b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-70b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-2-70b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json similarity index 85% rename from data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json rename to data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json index 1e9aac924..bb2c02730 100644 --- a/data/helm_lite/meta/llama-2-7b/b79fe2e3-5eec-46f8-90a1-810781c8c46a.json +++ b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-7b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-2-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json similarity index 85% rename from data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json rename to data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json index f13ee8122..876850010 100644 --- a/data/helm_lite/meta/llama-3-70b/998616ef-5d1b-4c65-b6ad-23afc3630d5a.json +++ b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3-70b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-3-70b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json similarity index 85% rename from data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json rename to data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json index 7f18bf5fd..87ab72524 100644 --- a/data/helm_lite/meta/llama-3-8b/fa010e81-1fc8-4a6e-a6e4-a9dd2c1f3f21.json +++ b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3-8b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-3-8b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json similarity index 86% rename from data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json rename to data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json index 6ef4300e5..0bc6225d5 100644 --- a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/25fde5e6-86b8-4a80-8f79-5946ef9999fc.json +++ b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json similarity index 86% rename from data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json rename to data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json index 8afc05c39..d57074cb2 100644 --- a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/b955825d-ae7f-48c4-9dad-5ee78879737d.json +++ b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json similarity index 86% rename from data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json rename to data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json index e5cc6d55a..198d81cd2 100644 --- a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/168da8dc-e4a7-4cf0-8384-5edb9f3fcb89.json +++ b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json similarity index 85% rename from data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json rename to data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json index 793304d91..722a6f050 100644 --- a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/0807e353-9787-4ca0-8f7b-50d1bed2469e.json +++ b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json similarity index 85% rename from data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json rename to data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json index 615526ba3..8bef7c4e9 100644 --- a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/0164b885-2c27-4eba-8e6f-e69156cb0dee.json +++ b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json similarity index 86% rename from data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json rename to data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json index 8116db8ba..cc4cca983 100644 --- a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/08422837-51a0-45c9-9004-fc5d98dce462.json +++ b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json similarity index 85% rename from data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json rename to data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json index f3354af37..ebea32b6c 100644 --- a/data/helm_lite/meta/llama-65b/39f2c7f2-56d4-4349-95ae-374d34263f48.json +++ b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-65b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/meta_llama-65b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json similarity index 85% rename from data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json rename to data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json index 172c44cac..ee330c2d2 100644 --- a/data/helm_lite/microsoft/phi-2/0ed07d7b-4271-4c0d-8dea-c6fade2d2eb4.json +++ b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-2/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/microsoft_phi-2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json similarity index 86% rename from data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json rename to data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json index c613f7fec..6d945026f 100644 --- a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/75b8b916-5861-41f2-ad85-90ec3d8a3b5c.json +++ b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json similarity index 85% rename from data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json rename to data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json index f78b3f049..c7b88764b 100644 --- a/data/helm_lite/microsoft/phi-3-small-8k-instruct/2de4b89a-3f3b-4d1d-ba85-030953a46956.json +++ b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json similarity index 86% rename from data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json rename to data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json index 97f8b3a1e..fd0f8e02b 100644 --- a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/bd68405f-fe9a-448b-9c80-468c656594e5.json +++ b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json similarity index 85% rename from data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json rename to data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json index 30337d5a4..8f4801f23 100644 --- a/data/helm_lite/mistralai/mistral-7b-v0.1/4267fef1-3180-46e3-990e-0d1092ec4c18.json +++ b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json similarity index 86% rename from data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json rename to data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json index edea4050d..d8d60cc37 100644 --- a/data/helm_lite/mistralai/mistral-large-2402/002a34dc-39e5-451d-b2a8-b51bdb69a056.json +++ b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json similarity index 86% rename from data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json rename to data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json index d2dd06c67..d75c9932b 100644 --- a/data/helm_lite/mistralai/mistral-large-2407/5ae1d2b2-cf06-4622-81e6-803be31d8aa5.json +++ b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json similarity index 86% rename from data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json rename to data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json index cbbf76044..6bb7115e2 100644 --- a/data/helm_lite/mistralai/mistral-medium-2312/ad2beded-cec3-4b47-b8de-a32a3225fa66.json +++ b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json similarity index 86% rename from data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json rename to data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json index d1c6bf6d0..1f2cb2632 100644 --- a/data/helm_lite/mistralai/mistral-small-2402/eb901347-fc1f-4d8f-a70a-05a83e16658d.json +++ b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json similarity index 85% rename from data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json rename to data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json index d020ec2ef..e6bfd0332 100644 --- a/data/helm_lite/mistralai/mixtral-8x22b/9bb7be82-16f1-428e-9e2b-8e82f2bf0cc2.json +++ b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json similarity index 86% rename from data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json rename to data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json index ca92e5358..7bf0323b1 100644 --- a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/042cacf9-0d8e-4ff7-8d9d-8152fa48fa7b.json +++ b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json similarity index 86% rename from data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json rename to data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json index 75b65c3cd..7fee5cb57 100644 --- a/data/helm_lite/mistralai/open-mistral-nemo-2407/d2d48e4a-0484-4f44-8108-2e689d7ca695.json +++ b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json similarity index 85% rename from data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json rename to data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json index c3db66d6f..878d33981 100644 --- a/data/helm_lite/openai/gpt-3.5-turbo-0613/e54ae605-a91d-47d7-a08d-67bd0ea5c606.json +++ b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json similarity index 85% rename from data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json rename to data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json index 3b34bbe84..7ff111f74 100644 --- a/data/helm_lite/openai/gpt-4-0613/15dccf75-871d-457b-8495-e0d03d550360.json +++ b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-0613/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_gpt-4-0613/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json similarity index 85% rename from data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json rename to data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json index f80298de5..060ab8fb5 100644 --- a/data/helm_lite/openai/gpt-4-1106-preview/18fe5d30-bf36-405a-819e-1ecabda327ea.json +++ b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json similarity index 86% rename from data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json rename to data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json index 49bdd419a..dae83b652 100644 --- a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/cc177e1d-6fde-4e6c-8469-5ae46dd8e06d.json +++ b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json similarity index 85% rename from data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json rename to data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json index ab2f778b6..c23053f17 100644 --- a/data/helm_lite/openai/gpt-4o-2024-05-13/cd199905-04a4-4745-b848-4f7bde97ca17.json +++ b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json similarity index 85% rename from data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json rename to data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json index 3d286d830..f8d7c3614 100644 --- a/data/helm_lite/openai/gpt-4o-2024-08-06/1c9a4dad-8abd-48eb-b058-dfe713d7bd6f.json +++ b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json similarity index 86% rename from data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json rename to data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json index 53ecaa7dc..3869cb246 100644 --- a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bfd70aff-bf45-4f55-b730-4924afc181cd.json +++ b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json similarity index 85% rename from data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json rename to data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json index c90d2c5a2..f3294dd85 100644 --- a/data/helm_lite/openai/text-davinci-002/b6e08679-1bd7-42a1-9eee-98252de2c7c1.json +++ b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_text-davinci-002/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_text-davinci-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json similarity index 85% rename from data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json rename to data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json index 6f2c648e1..93f27df2b 100644 --- a/data/helm_lite/openai/text-davinci-003/22b411d5-a314-4b17-a9c7-c1af7ca7df33.json +++ b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_text-davinci-003/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/openai_text-davinci-003/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json similarity index 85% rename from data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json rename to data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json index 3b85e6b08..800f57826 100644 --- a/data/helm_lite/qwen/qwen1.5-110b-chat/f4f72859-eebd-4f9f-bfe8-bb3695ba544e.json +++ b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json similarity index 85% rename from data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json rename to data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json index 4df79c00f..c8749e5f5 100644 --- a/data/helm_lite/qwen/qwen1.5-14b/fb1bb023-16f6-4914-889b-6458d7ab1277.json +++ b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json similarity index 85% rename from data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json rename to data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json index 74a01181c..699c1515b 100644 --- a/data/helm_lite/qwen/qwen1.5-32b/8b572c10-3553-4e51-a321-bdb05996914b.json +++ b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json similarity index 85% rename from data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json rename to data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json index a056d0e42..8b347b68d 100644 --- a/data/helm_lite/qwen/qwen1.5-72b/6ffe921e-fd6d-4423-bd31-e1f7d34b2936.json +++ b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json similarity index 85% rename from data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json rename to data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json index 0757d65b1..b1bc89d92 100644 --- a/data/helm_lite/qwen/qwen1.5-7b/e0efe169-d28e-418e-a78c-9b04ec29aae2.json +++ b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json similarity index 85% rename from data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json rename to data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json index 2d8d0469d..58edcde03 100644 --- a/data/helm_lite/qwen/qwen2-72b-instruct/05b11551-b0ee-4c9c-bba5-fe9c9e8fb92b.json +++ b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json similarity index 86% rename from data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json rename to data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json index 6091d879d..3e08a0cdf 100644 --- a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/983696ae-d7f3-48a4-b7a0-a42487728182.json +++ b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -519,7 +519,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -528,7 +528,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -576,7 +576,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -585,7 +585,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json similarity index 86% rename from data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json rename to data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json index a9b9ae2a3..3f844c281 100644 --- a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/a969e516-adef-4839-9252-244c58ab3c67.json +++ b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -519,7 +519,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -528,7 +528,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -576,7 +576,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -585,7 +585,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json similarity index 86% rename from data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json rename to data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json index f7f93c913..09f377d89 100644 --- a/data/helm_lite/snowflake/snowflake-arctic-instruct/f122f9de-b1ce-40ea-8731-6c00c7af0498.json +++ b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json similarity index 85% rename from data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json rename to data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json index 65a14de91..2bf240f96 100644 --- a/data/helm_lite/tiiuae/falcon-40b/5c7982c5-3513-4ff2-9857-33a0db825376.json +++ b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json similarity index 85% rename from data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json rename to data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json index 62d1fae1c..9a704269c 100644 --- a/data/helm_lite/tiiuae/falcon-7b/4910859a-750c-4728-bf30-309e0e81690e.json +++ b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json similarity index 85% rename from data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json rename to data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json index 9e56dbbb6..1f111d01c 100644 --- a/data/helm_lite/upstage/solar-pro-241126/32f0532f-b504-492d-84d7-f541930edad0.json +++ b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -453,7 +453,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -462,7 +462,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -518,7 +518,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -527,7 +527,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -575,7 +575,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -584,7 +584,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json similarity index 85% rename from data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json rename to data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json index 2b000451d..8026be475 100644 --- a/data/helm_lite/writer/palmyra-x-004/04c187a3-4532-4523-b39d-19314d61c779.json +++ b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-004/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/writer_palmyra-x-004/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -111,7 +111,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -120,7 +120,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -201,7 +201,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -210,7 +210,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -261,7 +261,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -270,7 +270,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -327,7 +327,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -336,7 +336,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -398,7 +398,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -407,7 +407,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -457,7 +457,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -466,7 +466,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -523,7 +523,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -532,7 +532,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -580,7 +580,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -589,7 +589,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json similarity index 85% rename from data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json rename to data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json index fc600d1dc..5e5faf9fb 100644 --- a/data/helm_lite/writer/palmyra-x-v2/4440532c-9b49-4c9a-8bf4-f122531c54fa.json +++ b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json similarity index 85% rename from data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json rename to data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json index 3ac2641c0..c8073d254 100644 --- a/data/helm_lite/writer/palmyra-x-v3/bc21efa7-7ca2-42bb-8e67-83ed761ee0a0.json +++ b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770829788.2883599", - "retrieved_timestamp": "1770829788.2883599", + "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -52,7 +52,7 @@ } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", "source_data": { "dataset_name": "NarrativeQA", "source_type": "url", @@ -61,7 +61,7 @@ ] }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -109,7 +109,7 @@ } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", @@ -118,7 +118,7 @@ ] }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -198,7 +198,7 @@ } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", "source_data": { "dataset_name": "OpenbookQA", "source_type": "url", @@ -207,7 +207,7 @@ ] }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -258,7 +258,7 @@ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", "source_data": { "dataset_name": "MMLU", "source_type": "url", @@ -267,7 +267,7 @@ ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -324,7 +324,7 @@ } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", "source_data": { "dataset_name": "MATH", "source_type": "url", @@ -333,7 +333,7 @@ ] }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,7 +394,7 @@ } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", "source_data": { "dataset_name": "GSM8K", "source_type": "url", @@ -403,7 +403,7 @@ ] }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -451,7 +451,7 @@ } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", "source_data": { "dataset_name": "LegalBench", "source_type": "url", @@ -460,7 +460,7 @@ ] }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -516,7 +516,7 @@ } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", "source_data": { "dataset_name": "MedQA", "source_type": "url", @@ -525,7 +525,7 @@ ] }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -573,7 +573,7 @@ } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", "source_data": { "dataset_name": "WMT 2014", "source_type": "url", @@ -582,7 +582,7 @@ ] }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json similarity index 90% rename from data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json rename to data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json index cdb3ca461..a5d4de71f 100644 --- a/data/helm_mmlu/01-ai/yi-34b/3769ecf8-4f5c-4bce-bac2-ab561c294ee4.json +++ b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json similarity index 90% rename from data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json rename to data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json index 1b8b7e56f..1f0a7e20f 100644 --- a/data/helm_mmlu/01-ai/yi-6b/6bb5b9d0-e6ba-4b58-bdf9-2ad80b6b643c.json +++ b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json similarity index 90% rename from data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json rename to data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json index 29bc15bb3..4838cda1c 100644 --- a/data/helm_mmlu/01-ai/yi-large-preview/3d0b3d68-a853-4989-a35e-83ac6722c2da.json +++ b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json similarity index 90% rename from data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json rename to data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json index 2bf971f25..45536e1a1 100644 --- a/data/helm_mmlu/ai21/jamba-1.5-large/ac77446d-4ae5-4cbc-b0d4-ddbfa293bbe7.json +++ b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json similarity index 90% rename from data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json rename to data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json index 7ffc27970..727c60261 100644 --- a/data/helm_mmlu/ai21/jamba-1.5-mini/517e8027-6edd-482b-86f3-33b6c41a9609.json +++ b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json similarity index 90% rename from data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json rename to data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json index 92ba45d60..3a25316d9 100644 --- a/data/helm_mmlu/ai21/jamba-instruct/f7c1c125-ad0f-4847-b880-4f705f1666c6.json +++ b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json similarity index 90% rename from data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json rename to data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json index e53150712..8bf036c64 100644 --- a/data/helm_mmlu/allenai/olmo-1.7-7b/5a0ba280-8a12-4735-9d92-4ed71ba395b4.json +++ b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json similarity index 90% rename from data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json rename to data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json index 301523f0f..2b8d4cdfb 100644 --- a/data/helm_mmlu/allenai/olmo-7b/73ccc6a6-e10d-4619-914f-26032cddf8da.json +++ b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json similarity index 90% rename from data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json rename to data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json index d80215b78..1bb99dccc 100644 --- a/data/helm_mmlu/amazon/nova-lite-v1_0/20c5af59-ff73-4731-9230-f92bb86e657b.json +++ b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json similarity index 90% rename from data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json rename to data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json index f28fc4ccf..ab9b8c843 100644 --- a/data/helm_mmlu/amazon/nova-micro-v1_0/fa4ea6f5-b04b-46f5-ab5d-5f3a8509dffc.json +++ b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json similarity index 90% rename from data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json rename to data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json index 66455ef1d..af30c4448 100644 --- a/data/helm_mmlu/amazon/nova-pro-v1_0/d30617fc-8d64-4070-b86a-c982025cfcea.json +++ b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json similarity index 90% rename from data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json rename to data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json index 163a9d31a..c2616d7f8 100644 --- a/data/helm_mmlu/anthropic/claude-2.1/aa8cae95-cb75-4241-951c-25e2046042dd.json +++ b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json similarity index 90% rename from data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json rename to data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json index edabc3b81..76628bf51 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/c88e4a03-22ae-4338-bf5f-36070814136a.json +++ b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json similarity index 90% rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json index 8d402d4fb..9d9557efc 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/4d15a55c-46c1-4741-bf33-9314a4e1d0b7.json +++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json similarity index 90% rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json index a435d5c4d..35be68aa6 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/ef404b2f-09d1-4ba5-a47f-efdfc631e78f.json +++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json similarity index 90% rename from data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json rename to data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json index 66b68fa6b..969900aba 100644 --- a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/097a8da1-f411-4359-8440-2ab06f4ae76c.json +++ b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json similarity index 90% rename from data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json rename to data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json index 140c0db28..230be4291 100644 --- a/data/helm_mmlu/anthropic/claude-3-opus-20240229/68130abd-1df5-4cd3-919a-2863e9f013c7.json +++ b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json similarity index 90% rename from data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json rename to data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json index a6eb131df..dd7543ecb 100644 --- a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/5d8d795a-d213-4b96-9b17-ad5fae6b3687.json +++ b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json similarity index 90% rename from data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json rename to data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json index 38a7ffacb..c9e9779b1 100644 --- a/data/helm_mmlu/anthropic/claude-instant-1.2/7908da03-f030-4c62-a121-c04bd94ea75e.json +++ b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json similarity index 90% rename from data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json rename to data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json index 4b85be9b2..6bebd236d 100644 --- a/data/helm_mmlu/cohere/command-r-plus/c6fdbf96-2500-4410-8fcd-268ea3e16062.json +++ b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json similarity index 90% rename from data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json rename to data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json index 90cbd571c..e82639d82 100644 --- a/data/helm_mmlu/cohere/command-r/537164c3-7b88-4543-b19d-370f55a25a66.json +++ b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/cohere_command-r/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/cohere_command-r/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json similarity index 90% rename from data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json rename to data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json index 753506525..d5f73b61f 100644 --- a/data/helm_mmlu/databricks/dbrx-instruct/0c539e26-8403-42db-acfc-7953dd80ae20.json +++ b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json similarity index 90% rename from data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json rename to data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json index f12e3799a..7ec071041 100644 --- a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/364c7490-8bb1-4e7e-b485-fb3c2224da58.json +++ b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json similarity index 90% rename from data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json rename to data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json index 86fd9dec9..200a6e19c 100644 --- a/data/helm_mmlu/deepseek-ai/deepseek-v3/1a9167d2-882c-4582-b4e0-ac425896a317.json +++ b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json similarity index 90% rename from data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json rename to data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json index 0184241c6..86096274a 100644 --- a/data/helm_mmlu/google/gemini-1.0-pro-001/8d8300a9-8dd4-49c1-b0a0-974c2ee51da5.json +++ b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json similarity index 90% rename from data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json rename to data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json index 7baa6457e..7aac2d734 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-001/d7b559b9-d7d1-4749-a6cf-f28e82f7b659.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json similarity index 90% rename from data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json rename to data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json index f095d6361..a87c94c3b 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-002/a94c9e13-dca7-4e02-a795-09d9274354d3.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json similarity index 90% rename from data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json rename to data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json index fe99bd4e4..b8d59d877 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/75c8b20f-a4d4-4699-be79-f027bf7f0d69.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json similarity index 90% rename from data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json rename to data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json index 4b9fc2846..0632aee68 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-001/264be7b4-08b7-40b6-a5e7-f3536f361450.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json similarity index 90% rename from data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json rename to data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json index 47f80252d..d6a3ba87a 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-002/83b1a336-5cbe-48bd-b8cc-4a359ce9c911.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json similarity index 90% rename from data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json rename to data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json index 901c1dd01..de3a77c03 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/8a013eb3-0f21-4a50-8a53-4ba977951130.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json similarity index 90% rename from data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json rename to data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json index 0eda6b6b1..6b53de064 100644 --- a/data/helm_mmlu/google/gemini-2.0-flash-exp/7b081a40-7cb6-4405-b842-3db95f290dfa.json +++ b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json similarity index 90% rename from data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json rename to data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json index 142296fc4..8720cc062 100644 --- a/data/helm_mmlu/google/gemma-2-27b/54185b53-9891-43c6-8f93-09ff02b728d8.json +++ b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json similarity index 90% rename from data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json rename to data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json index 6f84fd47f..2007b06df 100644 --- a/data/helm_mmlu/google/gemma-2-9b/884c194d-6519-4bd4-8add-6514e593c514.json +++ b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json similarity index 90% rename from data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json rename to data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json index ac525859f..963d13c9a 100644 --- a/data/helm_mmlu/google/gemma-7b/a80cbd76-bcf8-4174-b0b3-346fae152bdb.json +++ b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-7b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_gemma-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json similarity index 90% rename from data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json rename to data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json index b20dbe54d..c0271bcb3 100644 --- a/data/helm_mmlu/google/text-bison@001/5f105986-aa7d-4858-91bc-cece9d0085ba.json +++ b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_text-bison@001/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_text-bison@001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json similarity index 90% rename from data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json rename to data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json index 7b3536f41..42c5040aa 100644 --- a/data/helm_mmlu/google/text-unicorn@001/528b7b4e-c8a6-4387-bd98-497a3316029d.json +++ b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json similarity index 90% rename from data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json rename to data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json index a786ac0dd..453cd8b3a 100644 --- a/data/helm_mmlu/meta/llama-2-13b/96eb34db-66bd-4945-8b4c-a8c1394fe56a.json +++ b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json similarity index 90% rename from data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json rename to data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json index bd988b6d8..aa6a9caa2 100644 --- a/data/helm_mmlu/meta/llama-2-70b/961e917b-0e67-462c-b9d0-0fe4b4b85beb.json +++ b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json similarity index 90% rename from data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json rename to data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json index b29cd7460..0649e7329 100644 --- a/data/helm_mmlu/meta/llama-2-7b/59a85d2c-16ce-4ed4-bc65-f6898127fa57.json +++ b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json similarity index 90% rename from data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json rename to data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json index d46d7f50a..4f09a5ee3 100644 --- a/data/helm_mmlu/meta/llama-3-70b/16a8b446-51fc-4c23-9231-46ee16c1c0a8.json +++ b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json similarity index 90% rename from data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json rename to data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json index 31dfddc02..83f907e80 100644 --- a/data/helm_mmlu/meta/llama-3-8b/f4de7e58-7060-440b-8f6f-1f79d7499d1e.json +++ b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json similarity index 90% rename from data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json rename to data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json index 64eb43090..c4ce37e9d 100644 --- a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/5337a1bc-9b9d-42be-9fb3-5e733f08ffc3.json +++ b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json similarity index 90% rename from data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json rename to data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json index 149eb0100..0e4b849f9 100644 --- a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/dc6aa933-67e4-4811-b3e2-e5200c002abe.json +++ b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json similarity index 90% rename from data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json rename to data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json index 46bd04117..6c1d661d4 100644 --- a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/5f9758a3-fd6d-4598-930a-9c01420d05e8.json +++ b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json similarity index 90% rename from data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json rename to data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json index 187d1c6a7..599cd6855 100644 --- a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/7592c0d8-a06c-4189-81a1-dbf794d22c8b.json +++ b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json similarity index 90% rename from data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json rename to data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json index 9625c1e16..f14700c78 100644 --- a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/83c0e8e3-087c-4d61-9153-e571b4971871.json +++ b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json similarity index 90% rename from data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json rename to data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json index 8effae129..faf8ae128 100644 --- a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/c4640f7a-b1cd-41bc-8ea5-7fe2d944dced.json +++ b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json similarity index 90% rename from data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json rename to data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json index 07027b7fb..95bd9f1b8 100644 --- a/data/helm_mmlu/microsoft/phi-2/5baac093-babb-41cd-a2f4-985d0b91be37.json +++ b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-2/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/microsoft_phi-2/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json similarity index 90% rename from data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json rename to data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json index 6ce22179c..f1d62a268 100644 --- a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/1bf54088-ba12-45b4-8f80-63d5c38f58f6.json +++ b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json similarity index 90% rename from data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json rename to data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json index 7278b002a..bbe3afca0 100644 --- a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/5ed0a970-200f-4f23-9623-e714afa49ddf.json +++ b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json similarity index 90% rename from data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json rename to data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json index 886ff1732..e788149e1 100644 --- a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/e7fd06a6-65e5-4f88-8e86-c513f78e31db.json +++ b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json similarity index 90% rename from data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json rename to data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json index 935804d7f..5ca508d3b 100644 --- a/data/helm_mmlu/mistralai/mistral-7b-v0.1/ac047aef-008f-4c87-a6d5-4f331ebf5c53.json +++ b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json similarity index 90% rename from data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json rename to data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json index bc72ce600..6b7873124 100644 --- a/data/helm_mmlu/mistralai/mistral-large-2402/ec2e5e1c-3bbb-46e2-a1f2-b90ee891dabc.json +++ b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json similarity index 90% rename from data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json rename to data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json index 272dc142d..58aa6a379 100644 --- a/data/helm_mmlu/mistralai/mistral-large-2407/7517b6c9-c613-416c-aadb-39fd6d252da7.json +++ b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json similarity index 90% rename from data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json rename to data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json index 607dc1e03..457d9ed2a 100644 --- a/data/helm_mmlu/mistralai/mistral-small-2402/85fbf0b1-b32f-40d9-8144-d56e6f74bd4b.json +++ b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json similarity index 90% rename from data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json rename to data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json index 401d4b7c8..c7ab33c35 100644 --- a/data/helm_mmlu/mistralai/mixtral-8x22b/df568c3c-8a5c-4455-836d-c980d7f5ea5c.json +++ b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json similarity index 90% rename from data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json rename to data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json index b88295eb7..3ed7c6104 100644 --- a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/96e24977-ca6d-402c-bfd8-62be4cd9b902.json +++ b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json similarity index 90% rename from data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json rename to data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json index 5a436d9c9..e5aec6b67 100644 --- a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e5b2636a-8438-40c0-9f89-9f35585bf740.json +++ b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json similarity index 90% rename from data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json rename to data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json index 5923a61b0..e429d6dbc 100644 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/f3259d92-3c95-4b78-81ae-f7f4b80aec63.json +++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json similarity index 90% rename from data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json rename to data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json index c62c20e9c..92faf2169 100644 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/5ba23a34-4232-487f-b3e9-326d776135be.json +++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json similarity index 90% rename from data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json rename to data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json index 9877671a2..6ccc418f3 100644 --- a/data/helm_mmlu/openai/gpt-4-0613/5bc1a462-f753-4259-91c3-a549491b2986.json +++ b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json similarity index 90% rename from data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json rename to data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json index 448f5bbca..610be9719 100644 --- a/data/helm_mmlu/openai/gpt-4-1106-preview/16ec8b67-4da3-4dbc-aa24-35f4a22d148e.json +++ b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json similarity index 90% rename from data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json rename to data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json index aefe21734..a348a9fb9 100644 --- a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/dd13eca9-9e4e-44cb-87b7-6daebdbb0fac.json +++ b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json similarity index 90% rename from data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json rename to data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json index efc7bbe5a..76ba53d53 100644 --- a/data/helm_mmlu/openai/gpt-4o-2024-05-13/2ca11d4c-52e6-49ea-a5cb-238c0313c483.json +++ b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json similarity index 90% rename from data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json rename to data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json index fe9568710..2d538eb02 100644 --- a/data/helm_mmlu/openai/gpt-4o-2024-08-06/de400624-6c2e-47af-b851-54c4075c30ee.json +++ b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json similarity index 90% rename from data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json rename to data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json index 681eae3b7..7753003a8 100644 --- a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/34441b3b-4d66-444c-af85-ca0666a48ed4.json +++ b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json similarity index 90% rename from data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json rename to data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json index 6667a05bb..4b924f5af 100644 --- a/data/helm_mmlu/qwen/qwen1.5-110b-chat/eecf5e40-9110-47ea-a72b-9ba587b96e30.json +++ b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json similarity index 90% rename from data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json rename to data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json index ce5d472c6..9bfc87f91 100644 --- a/data/helm_mmlu/qwen/qwen1.5-14b/f26fb123-c214-4d18-aea8-b05b4ea1819b.json +++ b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json similarity index 90% rename from data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json rename to data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json index ff8059b60..d1a9f19e1 100644 --- a/data/helm_mmlu/qwen/qwen1.5-32b/30cfb5af-c6d8-4ec6-a1a7-1d9f0aab82d5.json +++ b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json similarity index 90% rename from data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json rename to data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json index c69a6d09c..94c5e4e80 100644 --- a/data/helm_mmlu/qwen/qwen1.5-72b/b152cd5c-cbc0-48f4-ba37-16878c3afba1.json +++ b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json similarity index 90% rename from data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json rename to data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json index 8651674c9..166da7894 100644 --- a/data/helm_mmlu/qwen/qwen1.5-7b/dac223e9-3073-46f9-924b-c5a6408f5da9.json +++ b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json similarity index 90% rename from data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json rename to data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json index 89026d1dc..6f8b955e0 100644 --- a/data/helm_mmlu/qwen/qwen2-72b-instruct/a7a218ff-7afe-417c-ac39-cf305d592d56.json +++ b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json similarity index 90% rename from data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json rename to data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json index 1a03b982a..a61d620fd 100644 --- a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/2e165735-43b8-4317-9cde-35aa4b5bcb26.json +++ b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json similarity index 90% rename from data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json rename to data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json index 032da16a1..c045e519d 100644 --- a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/15c25bc5-7b1e-4771-bda2-fd04d74e1463.json +++ b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json similarity index 90% rename from data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json rename to data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json index 5482f32f0..0afa77758 100644 --- a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/26036c7c-e981-46e8-b5e9-dcd7d116af70.json +++ b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json similarity index 90% rename from data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json rename to data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json index b71ad83e6..2c0cfc48a 100644 --- a/data/helm_mmlu/upstage/solar-pro-241126/b3269e4e-98a7-4795-8ef3-fc87774a54b7.json +++ b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json similarity index 90% rename from data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json rename to data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json index 734ce34f3..c204b253d 100644 --- a/data/helm_mmlu/writer/palmyra-x-004/284fde9f-8570-4e6d-9190-e52d8723fe57.json +++ b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json similarity index 90% rename from data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json rename to data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json index 0e5669e0a..2eef769c8 100644 --- a/data/helm_mmlu/writer/palmyra-x-v3/fdd7ef1e-4e7d-40a0-9cff-4a5e59648805.json +++ b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json @@ -1,7 +1,7 @@ { "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770830564.5477738", - "retrieved_timestamp": "1770830564.5477738", + "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -16,16 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", "source_data": { - "dataset_name": "MMLU All Subjects", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,16 +194,16 @@ } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", "source_data": { - "dataset_name": "Abstract Algebra", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -256,16 +256,16 @@ } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", "source_data": { - "dataset_name": "Anatomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -318,16 +318,16 @@ } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", "source_data": { - "dataset_name": "College Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -530,16 +530,16 @@ } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", "source_data": { - "dataset_name": "Computer Security", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -592,16 +592,16 @@ } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", "source_data": { - "dataset_name": "Econometrics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -654,16 +654,16 @@ } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", "source_data": { - "dataset_name": "Global Facts", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -716,16 +716,16 @@ } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", "source_data": { - "dataset_name": "Jurisprudence", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -778,16 +778,16 @@ } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", "source_data": { - "dataset_name": "Philosophy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -840,16 +840,16 @@ } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", "source_data": { - "dataset_name": "Professional Psychology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -992,16 +992,16 @@ } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", "source_data": { - "dataset_name": "Us Foreign Policy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1054,16 +1054,16 @@ } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", "source_data": { - "dataset_name": "Astronomy", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1116,16 +1116,16 @@ } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", "source_data": { - "dataset_name": "Business Ethics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1178,16 +1178,16 @@ } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", "source_data": { - "dataset_name": "Clinical Knowledge", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1240,16 +1240,16 @@ } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", "source_data": { - "dataset_name": "Conceptual Physics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1302,16 +1302,16 @@ } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", "source_data": { - "dataset_name": "Electrical Engineering", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1364,16 +1364,16 @@ } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", "source_data": { - "dataset_name": "Elementary Mathematics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1426,16 +1426,16 @@ } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", "source_data": { - "dataset_name": "Formal Logic", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1488,16 +1488,16 @@ } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", "source_data": { - "dataset_name": "High School World History", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1940,16 +1940,16 @@ } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", "source_data": { - "dataset_name": "Human Sexuality", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2032,16 +2032,16 @@ } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", "source_data": { - "dataset_name": "International Law", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2094,16 +2094,16 @@ } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", "source_data": { - "dataset_name": "Logical Fallacies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2156,16 +2156,16 @@ } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", "source_data": { - "dataset_name": "Machine Learning", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2218,16 +2218,16 @@ } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", "source_data": { - "dataset_name": "Management", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2280,16 +2280,16 @@ } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", "source_data": { - "dataset_name": "Marketing", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2342,16 +2342,16 @@ } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", "source_data": { - "dataset_name": "Medical Genetics", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2404,16 +2404,16 @@ } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "Miscellaneous", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2466,16 +2466,16 @@ } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "Moral Scenarios", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2558,16 +2558,16 @@ } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "Nutrition", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2620,16 @@ } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "Prehistory", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2682,16 +2682,16 @@ } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "Public Relations", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2744,16 +2744,16 @@ } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "Security Studies", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2806,16 +2806,16 @@ } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "Sociology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2868,16 +2868,16 @@ } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "Virology", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2930,16 +2930,16 @@ } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "World Religions", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, diff --git a/utils/helm/adapter.py b/utils/helm/adapter.py index acb5330d9..a3a7aca96 100644 --- a/utils/helm/adapter.py +++ b/utils/helm/adapter.py @@ -193,9 +193,22 @@ def convert( or "instruct" in leaderboard_name.lower() ) + if full_eval_name.lower().startswith('mean'): + metric_name = None + dataset_name = leaderboard_name + evaluation_name = full_eval_name + else: + dataset_name, metric_name = full_eval_name.split(' - ', 1) + evaluation_name = dataset_name + + if metric_name: + evaluation_description = f'{metric_name} on {dataset_name}' + else: + evaluation_description = header.get("description") + if is_new_metric: metric_config = MetricConfig( - evaluation_description=header.get("description"), + evaluation_description=evaluation_description, lower_is_better=header.get("lower_is_better", False), min_score=( 0.0 if mins[col_idx] >= 0 else math.floor(mins[col_idx]) @@ -206,13 +219,10 @@ def convert( score_type=ScoreType.continuous, ) - if full_eval_name.lower().startswith('mean'): - dataset_name = leaderboard_name - else: - dataset_name = full_eval_name.split(' - ')[0] + source_dataset_name = leaderboard_name if leaderboard_name.lower() == 'helm_mmlu' else dataset_name source_data = SourceDataUrl( - dataset_name=dataset_name, + dataset_name=source_dataset_name, source_type='url', url=[args.source_data_url] ) @@ -224,7 +234,7 @@ def convert( ) model_results[model_name][short_name] = EvaluationResult( - evaluation_name=full_eval_name, + evaluation_name=evaluation_name, source_data=source_data, metric_config=metric_config, score_details=ScoreDetails(